In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
In [3]:
df = pd.read_csv('F:\Documents\GitHub\Rapido_Dataset.csv')
In [4]:
df.head()
Out[4]:
| trip_id | customer_id | timestamp | pick_lat | pick_lng | drop_lat | drop_lng | travel_distance | travel_time | trip_fare | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ID001 | CUST_001 | 1.546709e+12 | 17.442705 | 78.387878 | 17.457829 | 78.399056 | 2.806 | 12.609667 | 37.0 |
| 1 | ID002 | CUST_002 | 1.546709e+12 | 17.490189 | 78.415512 | 17.450548 | 78.367294 | 11.991 | 24.075200 | 119.0 |
| 2 | ID003 | CUST_003 | 1.546709e+12 | 17.370108 | 78.515045 | 17.377041 | 78.517921 | 1.322 | 8.708300 | 27.0 |
| 3 | ID004 | CUST_004 | 1.546709e+12 | 17.439314 | 78.443001 | 17.397131 | 78.516586 | 11.822 | 24.037550 | 121.0 |
| 4 | ID005 | CUST_005 | 1.546709e+12 | 17.432325 | 78.381966 | 17.401625 | 78.400032 | 6.978 | 16.120867 | 58.0 |
In [5]:
import pandas as pd
# Convert Unix timestamp to a readable date and time
df['date_time'] = pd.to_datetime(df['timestamp'], unit='ms')
# Display the first few rows to see the conversion
print(df[['timestamp', 'date_time']].head())
timestamp date_time 0 1.546709e+12 2019-01-05 17:27:50.211 1 1.546709e+12 2019-01-05 17:28:29.524 2 1.546709e+12 2019-01-05 17:28:51.857 3 1.546709e+12 2019-01-05 17:29:18.403 4 1.546709e+12 2019-01-05 17:29:46.884
In [6]:
df
Out[6]:
| trip_id | customer_id | timestamp | pick_lat | pick_lng | drop_lat | drop_lng | travel_distance | travel_time | trip_fare | date_time | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ID001 | CUST_001 | 1.546709e+12 | 17.442705 | 78.387878 | 17.457829 | 78.399056 | 2.806 | 12.609667 | 37.0 | 2019-01-05 17:27:50.211 |
| 1 | ID002 | CUST_002 | 1.546709e+12 | 17.490189 | 78.415512 | 17.450548 | 78.367294 | 11.991 | 24.075200 | 119.0 | 2019-01-05 17:28:29.524 |
| 2 | ID003 | CUST_003 | 1.546709e+12 | 17.370108 | 78.515045 | 17.377041 | 78.517921 | 1.322 | 8.708300 | 27.0 | 2019-01-05 17:28:51.857 |
| 3 | ID004 | CUST_004 | 1.546709e+12 | 17.439314 | 78.443001 | 17.397131 | 78.516586 | 11.822 | 24.037550 | 121.0 | 2019-01-05 17:29:18.403 |
| 4 | ID005 | CUST_005 | 1.546709e+12 | 17.432325 | 78.381966 | 17.401625 | 78.400032 | 6.978 | 16.120867 | 58.0 | 2019-01-05 17:29:46.884 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 53421 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaT |
| 53422 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaT |
| 53423 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaT |
| 53424 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaT |
| 53425 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaT |
53426 rows × 11 columns
In [7]:
import pandas as pd
# Assume df is your DataFrame and you have already done the cleaning steps
# Convert Unix timestamp to a readable date and time
df['date_time'] = pd.to_datetime(df['timestamp'], unit='ms')
# Display the cleaned DataFrame to verify the new column
print(df.head())
trip_id customer_id timestamp pick_lat pick_lng drop_lat \
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625
drop_lng travel_distance travel_time trip_fare date_time
0 78.399056 2.806 12.609667 37.0 2019-01-05 17:27:50.211
1 78.367294 11.991 24.075200 119.0 2019-01-05 17:28:29.524
2 78.517921 1.322 8.708300 27.0 2019-01-05 17:28:51.857
3 78.516586 11.822 24.037550 121.0 2019-01-05 17:29:18.403
4 78.400032 6.978 16.120867 58.0 2019-01-05 17:29:46.884
In [9]:
import pandas as pd
# Display all rows that contain any None (NaN) values
none_values = df[df.isnull().any(axis=1)]
# Display the rows with None values
print(none_values)
trip_id customer_id timestamp pick_lat pick_lng drop_lat drop_lng \
44587 NaN NaN NaN NaN NaN NaN NaN
44588 NaN NaN NaN NaN NaN NaN NaN
44589 NaN NaN NaN NaN NaN NaN NaN
44590 NaN NaN NaN NaN NaN NaN NaN
44591 NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ...
53421 NaN NaN NaN NaN NaN NaN NaN
53422 NaN NaN NaN NaN NaN NaN NaN
53423 NaN NaN NaN NaN NaN NaN NaN
53424 NaN NaN NaN NaN NaN NaN NaN
53425 NaN NaN NaN NaN NaN NaN NaN
travel_distance travel_time trip_fare date_time
44587 NaN NaN NaN NaT
44588 NaN NaN NaN NaT
44589 NaN NaN NaN NaT
44590 NaN NaN NaN NaT
44591 NaN NaN NaN NaT
... ... ... ... ...
53421 NaN NaN NaN NaT
53422 NaN NaN NaN NaT
53423 NaN NaN NaN NaT
53424 NaN NaN NaN NaT
53425 NaN NaN NaN NaT
[8839 rows x 11 columns]
In [10]:
# Calculate the number of null values in each column
null_counts = df.isnull().sum()
# Display the number of null values for each column
print(null_counts)
trip_id 8839 customer_id 8839 timestamp 8839 pick_lat 8839 pick_lng 8839 drop_lat 8839 drop_lng 8839 travel_distance 8839 travel_time 8839 trip_fare 8839 date_time 8839 dtype: int64
In [11]:
import pandas as pd
# Convert Unix timestamp to a readable date and time by dividing by 1000
df['date_time'] = pd.to_datetime(df['timestamp'] / 1000, unit='s')
# Display the first few rows to verify the conversion
print(df[['timestamp', 'date_time']].head())
timestamp date_time 0 1.546709e+12 2019-01-05 17:27:50.210999966 1 1.546709e+12 2019-01-05 17:28:29.523999929 2 1.546709e+12 2019-01-05 17:28:51.857000113 3 1.546709e+12 2019-01-05 17:29:18.403000116 4 1.546709e+12 2019-01-05 17:29:46.884000063
In [12]:
df
Out[12]:
| trip_id | customer_id | timestamp | pick_lat | pick_lng | drop_lat | drop_lng | travel_distance | travel_time | trip_fare | date_time | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ID001 | CUST_001 | 1.546709e+12 | 17.442705 | 78.387878 | 17.457829 | 78.399056 | 2.806 | 12.609667 | 37.0 | 2019-01-05 17:27:50.210999966 |
| 1 | ID002 | CUST_002 | 1.546709e+12 | 17.490189 | 78.415512 | 17.450548 | 78.367294 | 11.991 | 24.075200 | 119.0 | 2019-01-05 17:28:29.523999929 |
| 2 | ID003 | CUST_003 | 1.546709e+12 | 17.370108 | 78.515045 | 17.377041 | 78.517921 | 1.322 | 8.708300 | 27.0 | 2019-01-05 17:28:51.857000113 |
| 3 | ID004 | CUST_004 | 1.546709e+12 | 17.439314 | 78.443001 | 17.397131 | 78.516586 | 11.822 | 24.037550 | 121.0 | 2019-01-05 17:29:18.403000116 |
| 4 | ID005 | CUST_005 | 1.546709e+12 | 17.432325 | 78.381966 | 17.401625 | 78.400032 | 6.978 | 16.120867 | 58.0 | 2019-01-05 17:29:46.884000063 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 53421 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaT |
| 53422 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaT |
| 53423 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaT |
| 53424 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaT |
| 53425 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaT |
53426 rows × 11 columns
In [13]:
# Split the 'date_time' column into 'date' and 'time' columns
df['date'] = df['date_time'].dt.date
df['time'] = df['date_time'].dt.time
# Display the updated DataFrame to verify the new columns
print(df[['date_time', 'date', 'time']].head())
date_time date time 0 2019-01-05 17:27:50.210999966 2019-01-05 17:27:50.210999 1 2019-01-05 17:28:29.523999929 2019-01-05 17:28:29.523999 2 2019-01-05 17:28:51.857000113 2019-01-05 17:28:51.857000 3 2019-01-05 17:29:18.403000116 2019-01-05 17:29:18.403000 4 2019-01-05 17:29:46.884000063 2019-01-05 17:29:46.884000
In [14]:
# Drop all rows with null values
df = df.dropna()
# Display the updated DataFrame to verify the changes
print(df.head())
trip_id customer_id timestamp pick_lat pick_lng drop_lat \
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625
drop_lng travel_distance travel_time trip_fare \
0 78.399056 2.806 12.609667 37.0
1 78.367294 11.991 24.075200 119.0
2 78.517921 1.322 8.708300 27.0
3 78.516586 11.822 24.037550 121.0
4 78.400032 6.978 16.120867 58.0
date_time date time
0 2019-01-05 17:27:50.210999966 2019-01-05 17:27:50.210999
1 2019-01-05 17:28:29.523999929 2019-01-05 17:28:29.523999
2 2019-01-05 17:28:51.857000113 2019-01-05 17:28:51.857000
3 2019-01-05 17:29:18.403000116 2019-01-05 17:29:18.403000
4 2019-01-05 17:29:46.884000063 2019-01-05 17:29:46.884000
In [15]:
df
Out[15]:
| trip_id | customer_id | timestamp | pick_lat | pick_lng | drop_lat | drop_lng | travel_distance | travel_time | trip_fare | date_time | date | time | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ID001 | CUST_001 | 1.546709e+12 | 17.442705 | 78.387878 | 17.457829 | 78.399056 | 2.806 | 12.609667 | 37.0 | 2019-01-05 17:27:50.210999966 | 2019-01-05 | 17:27:50.210999 |
| 1 | ID002 | CUST_002 | 1.546709e+12 | 17.490189 | 78.415512 | 17.450548 | 78.367294 | 11.991 | 24.075200 | 119.0 | 2019-01-05 17:28:29.523999929 | 2019-01-05 | 17:28:29.523999 |
| 2 | ID003 | CUST_003 | 1.546709e+12 | 17.370108 | 78.515045 | 17.377041 | 78.517921 | 1.322 | 8.708300 | 27.0 | 2019-01-05 17:28:51.857000113 | 2019-01-05 | 17:28:51.857000 |
| 3 | ID004 | CUST_004 | 1.546709e+12 | 17.439314 | 78.443001 | 17.397131 | 78.516586 | 11.822 | 24.037550 | 121.0 | 2019-01-05 17:29:18.403000116 | 2019-01-05 | 17:29:18.403000 |
| 4 | ID005 | CUST_005 | 1.546709e+12 | 17.432325 | 78.381966 | 17.401625 | 78.400032 | 6.978 | 16.120867 | 58.0 | 2019-01-05 17:29:46.884000063 | 2019-01-05 | 17:29:46.884000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 44582 | ID44583 | CUST_19137 | 1.546531e+12 | 17.443661 | 78.391968 | 17.451042 | 78.371658 | 5.665 | 12.573883 | 55.0 | 2019-01-03 16:04:51.043999910 | 2019-01-03 | 16:04:51.043999 |
| 44583 | ID44584 | CUST_19138 | 1.546532e+12 | 17.439289 | 78.396118 | 17.449976 | 78.389160 | 9.292 | 17.937717 | 76.0 | 2019-01-03 16:05:05.868999958 | 2019-01-03 | 16:05:05.868999 |
| 44584 | ID44585 | CUST_5061 | 1.546532e+12 | 17.363689 | 78.535194 | 17.374418 | 78.529823 | 5.485 | 17.403850 | 55.0 | 2019-01-03 16:05:22.102999926 | 2019-01-03 | 16:05:22.102999 |
| 44585 | ID44586 | CUST_19139 | 1.546532e+12 | 17.401539 | 78.570076 | 17.416904 | 78.591362 | 2.326 | 14.236767 | 35.0 | 2019-01-03 16:05:36.428999901 | 2019-01-03 | 16:05:36.428999 |
| 44586 | ID44587 | CUST_15562 | 1.546532e+12 | 17.385243 | 78.479896 | 17.394102 | 78.499550 | 3.204 | 7.085400 | 37.0 | 2019-01-03 16:05:49.937999964 | 2019-01-03 | 16:05:49.937999 |
44587 rows × 13 columns
In [16]:
# Extract the year from the 'date_time' column and create a new 'year' column
df['year'] = df['date_time'].dt.year
# Display the DataFrame to verify the new column
print(df.head())
trip_id customer_id timestamp pick_lat pick_lng drop_lat \
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625
drop_lng travel_distance travel_time trip_fare \
0 78.399056 2.806 12.609667 37.0
1 78.367294 11.991 24.075200 119.0
2 78.517921 1.322 8.708300 27.0
3 78.516586 11.822 24.037550 121.0
4 78.400032 6.978 16.120867 58.0
date_time date time year
0 2019-01-05 17:27:50.210999966 2019-01-05 17:27:50.210999 2019
1 2019-01-05 17:28:29.523999929 2019-01-05 17:28:29.523999 2019
2 2019-01-05 17:28:51.857000113 2019-01-05 17:28:51.857000 2019
3 2019-01-05 17:29:18.403000116 2019-01-05 17:29:18.403000 2019
4 2019-01-05 17:29:46.884000063 2019-01-05 17:29:46.884000 2019
In [17]:
df
Out[17]:
| trip_id | customer_id | timestamp | pick_lat | pick_lng | drop_lat | drop_lng | travel_distance | travel_time | trip_fare | date_time | date | time | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ID001 | CUST_001 | 1.546709e+12 | 17.442705 | 78.387878 | 17.457829 | 78.399056 | 2.806 | 12.609667 | 37.0 | 2019-01-05 17:27:50.210999966 | 2019-01-05 | 17:27:50.210999 | 2019 |
| 1 | ID002 | CUST_002 | 1.546709e+12 | 17.490189 | 78.415512 | 17.450548 | 78.367294 | 11.991 | 24.075200 | 119.0 | 2019-01-05 17:28:29.523999929 | 2019-01-05 | 17:28:29.523999 | 2019 |
| 2 | ID003 | CUST_003 | 1.546709e+12 | 17.370108 | 78.515045 | 17.377041 | 78.517921 | 1.322 | 8.708300 | 27.0 | 2019-01-05 17:28:51.857000113 | 2019-01-05 | 17:28:51.857000 | 2019 |
| 3 | ID004 | CUST_004 | 1.546709e+12 | 17.439314 | 78.443001 | 17.397131 | 78.516586 | 11.822 | 24.037550 | 121.0 | 2019-01-05 17:29:18.403000116 | 2019-01-05 | 17:29:18.403000 | 2019 |
| 4 | ID005 | CUST_005 | 1.546709e+12 | 17.432325 | 78.381966 | 17.401625 | 78.400032 | 6.978 | 16.120867 | 58.0 | 2019-01-05 17:29:46.884000063 | 2019-01-05 | 17:29:46.884000 | 2019 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 44582 | ID44583 | CUST_19137 | 1.546531e+12 | 17.443661 | 78.391968 | 17.451042 | 78.371658 | 5.665 | 12.573883 | 55.0 | 2019-01-03 16:04:51.043999910 | 2019-01-03 | 16:04:51.043999 | 2019 |
| 44583 | ID44584 | CUST_19138 | 1.546532e+12 | 17.439289 | 78.396118 | 17.449976 | 78.389160 | 9.292 | 17.937717 | 76.0 | 2019-01-03 16:05:05.868999958 | 2019-01-03 | 16:05:05.868999 | 2019 |
| 44584 | ID44585 | CUST_5061 | 1.546532e+12 | 17.363689 | 78.535194 | 17.374418 | 78.529823 | 5.485 | 17.403850 | 55.0 | 2019-01-03 16:05:22.102999926 | 2019-01-03 | 16:05:22.102999 | 2019 |
| 44585 | ID44586 | CUST_19139 | 1.546532e+12 | 17.401539 | 78.570076 | 17.416904 | 78.591362 | 2.326 | 14.236767 | 35.0 | 2019-01-03 16:05:36.428999901 | 2019-01-03 | 16:05:36.428999 | 2019 |
| 44586 | ID44587 | CUST_15562 | 1.546532e+12 | 17.385243 | 78.479896 | 17.394102 | 78.499550 | 3.204 | 7.085400 | 37.0 | 2019-01-03 16:05:49.937999964 | 2019-01-03 | 16:05:49.937999 | 2019 |
44587 rows × 14 columns
In [18]:
# Drop the 'year' column
df = df.drop(columns=['year'])
# Display the DataFrame to verify the column is removed
print(df.head())
trip_id customer_id timestamp pick_lat pick_lng drop_lat \
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625
drop_lng travel_distance travel_time trip_fare \
0 78.399056 2.806 12.609667 37.0
1 78.367294 11.991 24.075200 119.0
2 78.517921 1.322 8.708300 27.0
3 78.516586 11.822 24.037550 121.0
4 78.400032 6.978 16.120867 58.0
date_time date time
0 2019-01-05 17:27:50.210999966 2019-01-05 17:27:50.210999
1 2019-01-05 17:28:29.523999929 2019-01-05 17:28:29.523999
2 2019-01-05 17:28:51.857000113 2019-01-05 17:28:51.857000
3 2019-01-05 17:29:18.403000116 2019-01-05 17:29:18.403000
4 2019-01-05 17:29:46.884000063 2019-01-05 17:29:46.884000
In [19]:
df
Out[19]:
| trip_id | customer_id | timestamp | pick_lat | pick_lng | drop_lat | drop_lng | travel_distance | travel_time | trip_fare | date_time | date | time | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ID001 | CUST_001 | 1.546709e+12 | 17.442705 | 78.387878 | 17.457829 | 78.399056 | 2.806 | 12.609667 | 37.0 | 2019-01-05 17:27:50.210999966 | 2019-01-05 | 17:27:50.210999 |
| 1 | ID002 | CUST_002 | 1.546709e+12 | 17.490189 | 78.415512 | 17.450548 | 78.367294 | 11.991 | 24.075200 | 119.0 | 2019-01-05 17:28:29.523999929 | 2019-01-05 | 17:28:29.523999 |
| 2 | ID003 | CUST_003 | 1.546709e+12 | 17.370108 | 78.515045 | 17.377041 | 78.517921 | 1.322 | 8.708300 | 27.0 | 2019-01-05 17:28:51.857000113 | 2019-01-05 | 17:28:51.857000 |
| 3 | ID004 | CUST_004 | 1.546709e+12 | 17.439314 | 78.443001 | 17.397131 | 78.516586 | 11.822 | 24.037550 | 121.0 | 2019-01-05 17:29:18.403000116 | 2019-01-05 | 17:29:18.403000 |
| 4 | ID005 | CUST_005 | 1.546709e+12 | 17.432325 | 78.381966 | 17.401625 | 78.400032 | 6.978 | 16.120867 | 58.0 | 2019-01-05 17:29:46.884000063 | 2019-01-05 | 17:29:46.884000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 44582 | ID44583 | CUST_19137 | 1.546531e+12 | 17.443661 | 78.391968 | 17.451042 | 78.371658 | 5.665 | 12.573883 | 55.0 | 2019-01-03 16:04:51.043999910 | 2019-01-03 | 16:04:51.043999 |
| 44583 | ID44584 | CUST_19138 | 1.546532e+12 | 17.439289 | 78.396118 | 17.449976 | 78.389160 | 9.292 | 17.937717 | 76.0 | 2019-01-03 16:05:05.868999958 | 2019-01-03 | 16:05:05.868999 |
| 44584 | ID44585 | CUST_5061 | 1.546532e+12 | 17.363689 | 78.535194 | 17.374418 | 78.529823 | 5.485 | 17.403850 | 55.0 | 2019-01-03 16:05:22.102999926 | 2019-01-03 | 16:05:22.102999 |
| 44585 | ID44586 | CUST_19139 | 1.546532e+12 | 17.401539 | 78.570076 | 17.416904 | 78.591362 | 2.326 | 14.236767 | 35.0 | 2019-01-03 16:05:36.428999901 | 2019-01-03 | 16:05:36.428999 |
| 44586 | ID44587 | CUST_15562 | 1.546532e+12 | 17.385243 | 78.479896 | 17.394102 | 78.499550 | 3.204 | 7.085400 | 37.0 | 2019-01-03 16:05:49.937999964 | 2019-01-03 | 16:05:49.937999 |
44587 rows × 13 columns
In [20]:
import pandas as pd
# Extract month from 'date_time' column
df['month'] = df['date_time'].dt.month
# Create separate DataFrames for pickup and drop-off locations
pickup_data = df[['month', 'pick_lat', 'pick_lng']]
dropoff_data = df[['month', 'drop_lat', 'drop_lng']]
# Rename columns for consistency
pickup_data.columns = ['month', 'latitude', 'longitude']
dropoff_data.columns = ['month', 'latitude', 'longitude']
In [21]:
df
Out[21]:
| trip_id | customer_id | timestamp | pick_lat | pick_lng | drop_lat | drop_lng | travel_distance | travel_time | trip_fare | date_time | date | time | month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ID001 | CUST_001 | 1.546709e+12 | 17.442705 | 78.387878 | 17.457829 | 78.399056 | 2.806 | 12.609667 | 37.0 | 2019-01-05 17:27:50.210999966 | 2019-01-05 | 17:27:50.210999 | 1 |
| 1 | ID002 | CUST_002 | 1.546709e+12 | 17.490189 | 78.415512 | 17.450548 | 78.367294 | 11.991 | 24.075200 | 119.0 | 2019-01-05 17:28:29.523999929 | 2019-01-05 | 17:28:29.523999 | 1 |
| 2 | ID003 | CUST_003 | 1.546709e+12 | 17.370108 | 78.515045 | 17.377041 | 78.517921 | 1.322 | 8.708300 | 27.0 | 2019-01-05 17:28:51.857000113 | 2019-01-05 | 17:28:51.857000 | 1 |
| 3 | ID004 | CUST_004 | 1.546709e+12 | 17.439314 | 78.443001 | 17.397131 | 78.516586 | 11.822 | 24.037550 | 121.0 | 2019-01-05 17:29:18.403000116 | 2019-01-05 | 17:29:18.403000 | 1 |
| 4 | ID005 | CUST_005 | 1.546709e+12 | 17.432325 | 78.381966 | 17.401625 | 78.400032 | 6.978 | 16.120867 | 58.0 | 2019-01-05 17:29:46.884000063 | 2019-01-05 | 17:29:46.884000 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 44582 | ID44583 | CUST_19137 | 1.546531e+12 | 17.443661 | 78.391968 | 17.451042 | 78.371658 | 5.665 | 12.573883 | 55.0 | 2019-01-03 16:04:51.043999910 | 2019-01-03 | 16:04:51.043999 | 1 |
| 44583 | ID44584 | CUST_19138 | 1.546532e+12 | 17.439289 | 78.396118 | 17.449976 | 78.389160 | 9.292 | 17.937717 | 76.0 | 2019-01-03 16:05:05.868999958 | 2019-01-03 | 16:05:05.868999 | 1 |
| 44584 | ID44585 | CUST_5061 | 1.546532e+12 | 17.363689 | 78.535194 | 17.374418 | 78.529823 | 5.485 | 17.403850 | 55.0 | 2019-01-03 16:05:22.102999926 | 2019-01-03 | 16:05:22.102999 | 1 |
| 44585 | ID44586 | CUST_19139 | 1.546532e+12 | 17.401539 | 78.570076 | 17.416904 | 78.591362 | 2.326 | 14.236767 | 35.0 | 2019-01-03 16:05:36.428999901 | 2019-01-03 | 16:05:36.428999 | 1 |
| 44586 | ID44587 | CUST_15562 | 1.546532e+12 | 17.385243 | 78.479896 | 17.394102 | 78.499550 | 3.204 | 7.085400 | 37.0 | 2019-01-03 16:05:49.937999964 | 2019-01-03 | 16:05:49.937999 | 1 |
44587 rows × 14 columns
In [22]:
import folium
from folium.plugins import HeatMap
# Function to create a heatmap for a given month
def create_heatmap(data, month, map_title):
m = folium.Map(location=[data['latitude'].mean(), data['longitude'].mean()], zoom_start=12)
HeatMap(data[data['month'] == month][['latitude', 'longitude']].values, radius=10).add_to(m)
m.save(f'{map_title}_month_{month}.html')
# Create heatmaps for each month
for month in pickup_data['month'].unique():
create_heatmap(pickup_data, month, 'Pickup_Hotspots')
create_heatmap(dropoff_data, month, 'Dropoff_Hotspots')
In [23]:
!pip install folium
Requirement already satisfied: folium in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (0.18.0) Requirement already satisfied: branca>=0.6.0 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from folium) (0.8.0) Requirement already satisfied: jinja2>=2.9 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from folium) (3.1.4) Requirement already satisfied: numpy in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from folium) (2.1.2) Requirement already satisfied: requests in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from folium) (2.32.3) Requirement already satisfied: xyzservices in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from folium) (2024.9.0) Requirement already satisfied: MarkupSafe>=2.0 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from jinja2>=2.9->folium) (3.0.1) Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from requests->folium) (3.4.0) Requirement already satisfied: idna<4,>=2.5 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from requests->folium) (3.10) Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from requests->folium) (2.2.3) Requirement already satisfied: certifi>=2017.4.17 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from requests->folium) (2024.8.30)
In [24]:
import pandas as pd
# Assuming df is your DataFrame and it has a column 'date_time' in datetime format
# Extract the date part (without year, month, or time)
df['date_only'] = df['date_time'].dt.strftime('%d-%m')
# Display all unique dates
unique_dates = df['date_only'].unique()
print("Unique Dates:", unique_dates)
Unique Dates: ['05-01' '03-01' '02-01' '07-01' '06-01' '04-01' '01-01']
In [25]:
# Drop the 'month' column
df = df.drop(columns=['month'])
# Display the DataFrame to verify the column is removed
print(df.head())
trip_id customer_id timestamp pick_lat pick_lng drop_lat \
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625
drop_lng travel_distance travel_time trip_fare \
0 78.399056 2.806 12.609667 37.0
1 78.367294 11.991 24.075200 119.0
2 78.517921 1.322 8.708300 27.0
3 78.516586 11.822 24.037550 121.0
4 78.400032 6.978 16.120867 58.0
date_time date time date_only
0 2019-01-05 17:27:50.210999966 2019-01-05 17:27:50.210999 05-01
1 2019-01-05 17:28:29.523999929 2019-01-05 17:28:29.523999 05-01
2 2019-01-05 17:28:51.857000113 2019-01-05 17:28:51.857000 05-01
3 2019-01-05 17:29:18.403000116 2019-01-05 17:29:18.403000 05-01
4 2019-01-05 17:29:46.884000063 2019-01-05 17:29:46.884000 05-01
In [26]:
df
Out[26]:
| trip_id | customer_id | timestamp | pick_lat | pick_lng | drop_lat | drop_lng | travel_distance | travel_time | trip_fare | date_time | date | time | date_only | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ID001 | CUST_001 | 1.546709e+12 | 17.442705 | 78.387878 | 17.457829 | 78.399056 | 2.806 | 12.609667 | 37.0 | 2019-01-05 17:27:50.210999966 | 2019-01-05 | 17:27:50.210999 | 05-01 |
| 1 | ID002 | CUST_002 | 1.546709e+12 | 17.490189 | 78.415512 | 17.450548 | 78.367294 | 11.991 | 24.075200 | 119.0 | 2019-01-05 17:28:29.523999929 | 2019-01-05 | 17:28:29.523999 | 05-01 |
| 2 | ID003 | CUST_003 | 1.546709e+12 | 17.370108 | 78.515045 | 17.377041 | 78.517921 | 1.322 | 8.708300 | 27.0 | 2019-01-05 17:28:51.857000113 | 2019-01-05 | 17:28:51.857000 | 05-01 |
| 3 | ID004 | CUST_004 | 1.546709e+12 | 17.439314 | 78.443001 | 17.397131 | 78.516586 | 11.822 | 24.037550 | 121.0 | 2019-01-05 17:29:18.403000116 | 2019-01-05 | 17:29:18.403000 | 05-01 |
| 4 | ID005 | CUST_005 | 1.546709e+12 | 17.432325 | 78.381966 | 17.401625 | 78.400032 | 6.978 | 16.120867 | 58.0 | 2019-01-05 17:29:46.884000063 | 2019-01-05 | 17:29:46.884000 | 05-01 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 44582 | ID44583 | CUST_19137 | 1.546531e+12 | 17.443661 | 78.391968 | 17.451042 | 78.371658 | 5.665 | 12.573883 | 55.0 | 2019-01-03 16:04:51.043999910 | 2019-01-03 | 16:04:51.043999 | 03-01 |
| 44583 | ID44584 | CUST_19138 | 1.546532e+12 | 17.439289 | 78.396118 | 17.449976 | 78.389160 | 9.292 | 17.937717 | 76.0 | 2019-01-03 16:05:05.868999958 | 2019-01-03 | 16:05:05.868999 | 03-01 |
| 44584 | ID44585 | CUST_5061 | 1.546532e+12 | 17.363689 | 78.535194 | 17.374418 | 78.529823 | 5.485 | 17.403850 | 55.0 | 2019-01-03 16:05:22.102999926 | 2019-01-03 | 16:05:22.102999 | 03-01 |
| 44585 | ID44586 | CUST_19139 | 1.546532e+12 | 17.401539 | 78.570076 | 17.416904 | 78.591362 | 2.326 | 14.236767 | 35.0 | 2019-01-03 16:05:36.428999901 | 2019-01-03 | 16:05:36.428999 | 03-01 |
| 44586 | ID44587 | CUST_15562 | 1.546532e+12 | 17.385243 | 78.479896 | 17.394102 | 78.499550 | 3.204 | 7.085400 | 37.0 | 2019-01-03 16:05:49.937999964 | 2019-01-03 | 16:05:49.937999 | 03-01 |
44587 rows × 14 columns
In [27]:
# Drop the 'date' column
df = df.drop(columns=['date'])
# Rename 'date_only' to 'date'
df = df.rename(columns={'date_only': 'date'})
# Display the DataFrame to verify the changes
print(df.head())
trip_id customer_id timestamp pick_lat pick_lng drop_lat \
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625
drop_lng travel_distance travel_time trip_fare \
0 78.399056 2.806 12.609667 37.0
1 78.367294 11.991 24.075200 119.0
2 78.517921 1.322 8.708300 27.0
3 78.516586 11.822 24.037550 121.0
4 78.400032 6.978 16.120867 58.0
date_time time date
0 2019-01-05 17:27:50.210999966 17:27:50.210999 05-01
1 2019-01-05 17:28:29.523999929 17:28:29.523999 05-01
2 2019-01-05 17:28:51.857000113 17:28:51.857000 05-01
3 2019-01-05 17:29:18.403000116 17:29:18.403000 05-01
4 2019-01-05 17:29:46.884000063 17:29:46.884000 05-01
In [28]:
df
Out[28]:
| trip_id | customer_id | timestamp | pick_lat | pick_lng | drop_lat | drop_lng | travel_distance | travel_time | trip_fare | date_time | time | date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ID001 | CUST_001 | 1.546709e+12 | 17.442705 | 78.387878 | 17.457829 | 78.399056 | 2.806 | 12.609667 | 37.0 | 2019-01-05 17:27:50.210999966 | 17:27:50.210999 | 05-01 |
| 1 | ID002 | CUST_002 | 1.546709e+12 | 17.490189 | 78.415512 | 17.450548 | 78.367294 | 11.991 | 24.075200 | 119.0 | 2019-01-05 17:28:29.523999929 | 17:28:29.523999 | 05-01 |
| 2 | ID003 | CUST_003 | 1.546709e+12 | 17.370108 | 78.515045 | 17.377041 | 78.517921 | 1.322 | 8.708300 | 27.0 | 2019-01-05 17:28:51.857000113 | 17:28:51.857000 | 05-01 |
| 3 | ID004 | CUST_004 | 1.546709e+12 | 17.439314 | 78.443001 | 17.397131 | 78.516586 | 11.822 | 24.037550 | 121.0 | 2019-01-05 17:29:18.403000116 | 17:29:18.403000 | 05-01 |
| 4 | ID005 | CUST_005 | 1.546709e+12 | 17.432325 | 78.381966 | 17.401625 | 78.400032 | 6.978 | 16.120867 | 58.0 | 2019-01-05 17:29:46.884000063 | 17:29:46.884000 | 05-01 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 44582 | ID44583 | CUST_19137 | 1.546531e+12 | 17.443661 | 78.391968 | 17.451042 | 78.371658 | 5.665 | 12.573883 | 55.0 | 2019-01-03 16:04:51.043999910 | 16:04:51.043999 | 03-01 |
| 44583 | ID44584 | CUST_19138 | 1.546532e+12 | 17.439289 | 78.396118 | 17.449976 | 78.389160 | 9.292 | 17.937717 | 76.0 | 2019-01-03 16:05:05.868999958 | 16:05:05.868999 | 03-01 |
| 44584 | ID44585 | CUST_5061 | 1.546532e+12 | 17.363689 | 78.535194 | 17.374418 | 78.529823 | 5.485 | 17.403850 | 55.0 | 2019-01-03 16:05:22.102999926 | 16:05:22.102999 | 03-01 |
| 44585 | ID44586 | CUST_19139 | 1.546532e+12 | 17.401539 | 78.570076 | 17.416904 | 78.591362 | 2.326 | 14.236767 | 35.0 | 2019-01-03 16:05:36.428999901 | 16:05:36.428999 | 03-01 |
| 44586 | ID44587 | CUST_15562 | 1.546532e+12 | 17.385243 | 78.479896 | 17.394102 | 78.499550 | 3.204 | 7.085400 | 37.0 | 2019-01-03 16:05:49.937999964 | 16:05:49.937999 | 03-01 |
44587 rows × 13 columns
In [29]:
# Count the number of unique values in the 'trip_id' column
unique_trip_ids = df['trip_id'].nunique()
# Display the number of unique trip IDs
print(f'Number of unique trip IDs: {unique_trip_ids}')
Number of unique trip IDs: 44587
In [30]:
import pandas as pd
# Extract the date from the 'date_time' column
df['date'] = df['date_time'].dt.date
# Create separate DataFrames for pickup and drop-off locations
pickup_data = df[['date', 'pick_lat', 'pick_lng']]
dropoff_data = df[['date', 'drop_lat', 'drop_lng']]
# Rename columns for consistency
pickup_data.columns = ['date', 'latitude', 'longitude']
dropoff_data.columns = ['date', 'latitude', 'longitude']
In [31]:
df
Out[31]:
| trip_id | customer_id | timestamp | pick_lat | pick_lng | drop_lat | drop_lng | travel_distance | travel_time | trip_fare | date_time | time | date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ID001 | CUST_001 | 1.546709e+12 | 17.442705 | 78.387878 | 17.457829 | 78.399056 | 2.806 | 12.609667 | 37.0 | 2019-01-05 17:27:50.210999966 | 17:27:50.210999 | 2019-01-05 |
| 1 | ID002 | CUST_002 | 1.546709e+12 | 17.490189 | 78.415512 | 17.450548 | 78.367294 | 11.991 | 24.075200 | 119.0 | 2019-01-05 17:28:29.523999929 | 17:28:29.523999 | 2019-01-05 |
| 2 | ID003 | CUST_003 | 1.546709e+12 | 17.370108 | 78.515045 | 17.377041 | 78.517921 | 1.322 | 8.708300 | 27.0 | 2019-01-05 17:28:51.857000113 | 17:28:51.857000 | 2019-01-05 |
| 3 | ID004 | CUST_004 | 1.546709e+12 | 17.439314 | 78.443001 | 17.397131 | 78.516586 | 11.822 | 24.037550 | 121.0 | 2019-01-05 17:29:18.403000116 | 17:29:18.403000 | 2019-01-05 |
| 4 | ID005 | CUST_005 | 1.546709e+12 | 17.432325 | 78.381966 | 17.401625 | 78.400032 | 6.978 | 16.120867 | 58.0 | 2019-01-05 17:29:46.884000063 | 17:29:46.884000 | 2019-01-05 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 44582 | ID44583 | CUST_19137 | 1.546531e+12 | 17.443661 | 78.391968 | 17.451042 | 78.371658 | 5.665 | 12.573883 | 55.0 | 2019-01-03 16:04:51.043999910 | 16:04:51.043999 | 2019-01-03 |
| 44583 | ID44584 | CUST_19138 | 1.546532e+12 | 17.439289 | 78.396118 | 17.449976 | 78.389160 | 9.292 | 17.937717 | 76.0 | 2019-01-03 16:05:05.868999958 | 16:05:05.868999 | 2019-01-03 |
| 44584 | ID44585 | CUST_5061 | 1.546532e+12 | 17.363689 | 78.535194 | 17.374418 | 78.529823 | 5.485 | 17.403850 | 55.0 | 2019-01-03 16:05:22.102999926 | 16:05:22.102999 | 2019-01-03 |
| 44585 | ID44586 | CUST_19139 | 1.546532e+12 | 17.401539 | 78.570076 | 17.416904 | 78.591362 | 2.326 | 14.236767 | 35.0 | 2019-01-03 16:05:36.428999901 | 16:05:36.428999 | 2019-01-03 |
| 44586 | ID44587 | CUST_15562 | 1.546532e+12 | 17.385243 | 78.479896 | 17.394102 | 78.499550 | 3.204 | 7.085400 | 37.0 | 2019-01-03 16:05:49.937999964 | 16:05:49.937999 | 2019-01-03 |
44587 rows × 13 columns
In [32]:
import folium
from folium.plugins import HeatMap
# Function to create a heatmap for a given date
def create_heatmap(data, day, map_title):
m = folium.Map(location=[data['latitude'].mean(), data['longitude'].mean()], zoom_start=12)
HeatMap(data[data['date'] == day][['latitude', 'longitude']].values, radius=10).add_to(m)
m.save(f'{map_title}_day_{day}.html')
# Create heatmaps for each date
for day in pickup_data['date'].unique():
create_heatmap(pickup_data, day, 'Pickup_Hotspots')
create_heatmap(dropoff_data, day, 'Dropoff_Hotspots')
In [33]:
import folium
from folium.plugins import HeatMap
# Average location to center the map around Telangana
center_lat = 17.123184
center_lng = 79.208824
# Function to create a heatmap for a given date
def create_heatmap(data, day, map_title):
m = folium.Map(location=[center_lat, center_lng], zoom_start=10) # Adjust zoom_start for optimal view
HeatMap(data[data['date'] == day][['latitude', 'longitude']].values, radius=10).add_to(m)
m.save(f'{map_title}_day_{day}.html')
return m
# Generate and display heatmaps for each day
for day in pickup_data['date'].unique():
pickup_map = create_heatmap(pickup_data, day, 'Pickup_Hotspots')
dropoff_map = create_heatmap(dropoff_data, day, 'Dropoff_Hotspots')
# Display the maps in Jupyter Notebook
display(pickup_map)
display(dropoff_map)
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
In [34]:
import folium
from folium.plugins import HeatMap
# Average location to center the map around Telangana
center_lat = 17.123184
center_lng = 79.208824
# Function to create a heatmap for a given date
def create_heatmap(data, day, map_title):
m = folium.Map(location=[center_lat, center_lng], zoom_start=10) # Adjust zoom_start for optimal view
HeatMap(data[data['date'] == day][['latitude', 'longitude']].values, radius=10).add_to(m)
m.save(f'{map_title}_day_{day}.html')
return m
# Generate and display heatmaps for each day
for day in pickup_data['date'].unique():
pickup_map = create_heatmap(pickup_data, day, 'Pickup_Hotspots')
dropoff_map = create_heatmap(dropoff_data, day, 'Dropoff_Hotspots')
# Display the maps in Jupyter Notebook
display(pickup_map)
display(dropoff_map)
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
In [35]:
import pandas as pd
import matplotlib.pyplot as plt
# Assume df is your DataFrame and contains a 'date' column and 'pick_lat' for pickup locations
# Group data by date and count the number of bookings per date
bookings_per_day = df.groupby('date')['pick_lat'].count()
# Plot the bar chart
plt.figure(figsize=(8, 4))
plt.bar(bookings_per_day.index, bookings_per_day.values, color='skyblue')
plt.xlabel('Date', fontsize=15)
plt.ylabel('Number of Bookings', fontsize=15)
plt.title('Number of Bookings by Date', fontsize=18)
plt.xticks(rotation=45)
plt.show()
In [36]:
df
Out[36]:
| trip_id | customer_id | timestamp | pick_lat | pick_lng | drop_lat | drop_lng | travel_distance | travel_time | trip_fare | date_time | time | date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ID001 | CUST_001 | 1.546709e+12 | 17.442705 | 78.387878 | 17.457829 | 78.399056 | 2.806 | 12.609667 | 37.0 | 2019-01-05 17:27:50.210999966 | 17:27:50.210999 | 2019-01-05 |
| 1 | ID002 | CUST_002 | 1.546709e+12 | 17.490189 | 78.415512 | 17.450548 | 78.367294 | 11.991 | 24.075200 | 119.0 | 2019-01-05 17:28:29.523999929 | 17:28:29.523999 | 2019-01-05 |
| 2 | ID003 | CUST_003 | 1.546709e+12 | 17.370108 | 78.515045 | 17.377041 | 78.517921 | 1.322 | 8.708300 | 27.0 | 2019-01-05 17:28:51.857000113 | 17:28:51.857000 | 2019-01-05 |
| 3 | ID004 | CUST_004 | 1.546709e+12 | 17.439314 | 78.443001 | 17.397131 | 78.516586 | 11.822 | 24.037550 | 121.0 | 2019-01-05 17:29:18.403000116 | 17:29:18.403000 | 2019-01-05 |
| 4 | ID005 | CUST_005 | 1.546709e+12 | 17.432325 | 78.381966 | 17.401625 | 78.400032 | 6.978 | 16.120867 | 58.0 | 2019-01-05 17:29:46.884000063 | 17:29:46.884000 | 2019-01-05 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 44582 | ID44583 | CUST_19137 | 1.546531e+12 | 17.443661 | 78.391968 | 17.451042 | 78.371658 | 5.665 | 12.573883 | 55.0 | 2019-01-03 16:04:51.043999910 | 16:04:51.043999 | 2019-01-03 |
| 44583 | ID44584 | CUST_19138 | 1.546532e+12 | 17.439289 | 78.396118 | 17.449976 | 78.389160 | 9.292 | 17.937717 | 76.0 | 2019-01-03 16:05:05.868999958 | 16:05:05.868999 | 2019-01-03 |
| 44584 | ID44585 | CUST_5061 | 1.546532e+12 | 17.363689 | 78.535194 | 17.374418 | 78.529823 | 5.485 | 17.403850 | 55.0 | 2019-01-03 16:05:22.102999926 | 16:05:22.102999 | 2019-01-03 |
| 44585 | ID44586 | CUST_19139 | 1.546532e+12 | 17.401539 | 78.570076 | 17.416904 | 78.591362 | 2.326 | 14.236767 | 35.0 | 2019-01-03 16:05:36.428999901 | 16:05:36.428999 | 2019-01-03 |
| 44586 | ID44587 | CUST_15562 | 1.546532e+12 | 17.385243 | 78.479896 | 17.394102 | 78.499550 | 3.204 | 7.085400 | 37.0 | 2019-01-03 16:05:49.937999964 | 16:05:49.937999 | 2019-01-03 |
44587 rows × 13 columns
In [37]:
import pandas as pd
# Extract the hour from the 'date_time' column
df['hour'] = df['date_time'].dt.hour
# Group data by hour and count the number of bookings per hour
bookings_per_hour = df.groupby('hour')['pick_lat'].count()
# Plot the bar chart
import matplotlib.pyplot as plt
plt.figure(figsize=(15, 8))
plt.bar(bookings_per_hour.index, bookings_per_hour.values, color='skyblue')
plt.xlabel('Hour', fontsize=15)
plt.ylabel('Number of Bookings', fontsize=15)
plt.title('Number of Bookings by Hour', fontsize=18)
plt.xticks(range(0, 24))
plt.show()
In [38]:
import pandas as pd
# Assume df is your DataFrame and contains 'date_time', 'pick_lat', 'pick_lng' columns
# Extract the hour from the 'date_time' column
df['hour'] = df['date_time'].dt.hour
# Define a function to identify the top hours for each hotspot location
def get_hotspot_hours(data, location_col, lat_col, lng_col):
# Group by location and hour, then count the number of bookings
hotspot_hours = data.groupby([lat_col, lng_col, 'hour'])[location_col].count().reset_index()
hotspot_hours.columns = [lat_col, lng_col, 'hour', 'bookings']
# Find the hour with the maximum bookings for each location
max_bookings_per_hour = hotspot_hours.loc[hotspot_hours.groupby([lat_col, lng_col])['bookings'].idxmax()]
return max_bookings_per_hour
# Get hotspot hours for pickup locations
pickup_hotspot_hours = get_hotspot_hours(df, 'trip_id', 'pick_lat', 'pick_lng')
# Display the results
print(pickup_hotspot_hours)
pick_lat pick_lng hour bookings 0 17.330339 78.530975 14 1 1 17.334276 78.532150 8 1 2 17.334480 78.549294 14 1 3 17.334595 78.552040 13 1 4 17.334942 78.569489 6 1 ... ... ... ... ... 44574 17.528427 78.388962 15 1 44575 17.528849 78.385376 6 1 44576 17.528875 78.385345 8 1 44577 17.528885 78.385391 14 1 44578 17.529791 78.434471 4 1 [44541 rows x 4 columns]
In [40]:
import pandas as pd
# Group by latitude and longitude to count the number of bookings per location
location_bookings = df.groupby(['pick_lat', 'pick_lng']).size().reset_index(name='number_of_bookings')
# Extract the hour from the 'date_time' column
df['hour'] = df['date_time'].dt.hour
In [41]:
# Define a function to find the peak booking hour for each location
def peak_booking_hour(data, lat_col, lng_col, hour_col):
# Group by location and hour, then count the number of bookings
hourly_bookings = data.groupby([lat_col, lng_col, hour_col]).size().reset_index(name='hourly_bookings')
# Find the hour with the maximum bookings for each location
peak_hours = hourly_bookings.loc[hourly_bookings.groupby([lat_col, lng_col])['hourly_bookings'].idxmax()]
return peak_hours
# Get peak booking hours for each location
peak_hours = peak_booking_hour(df, 'pick_lat', 'pick_lng', 'hour')
# Merge the booking counts with the peak hours
location_summary = pd.merge(location_bookings, peak_hours, on=['pick_lat', 'pick_lng'])
location_summary.columns = ['latitude', 'longitude', 'number_of_bookings', 'peak_booking_hour', 'hourly_bookings']
# Display the results
print(location_summary)
latitude longitude number_of_bookings peak_booking_hour \
0 17.330339 78.530975 1 14
1 17.334276 78.532150 1 8
2 17.334480 78.549294 1 14
3 17.334595 78.552040 1 13
4 17.334942 78.569489 1 6
... ... ... ... ...
44536 17.528427 78.388962 1 15
44537 17.528849 78.385376 1 6
44538 17.528875 78.385345 1 8
44539 17.528885 78.385391 1 14
44540 17.529791 78.434471 1 4
hourly_bookings
0 1
1 1
2 1
3 1
4 1
... ...
44536 1
44537 1
44538 1
44539 1
44540 1
[44541 rows x 5 columns]
In [42]:
import pandas as pd
# Assuming df is your DataFrame and it has 'pick_lat' and 'pick_lng' columns
# Extract unique pickup locations
unique_locations = df[['pick_lat', 'pick_lng']].drop_duplicates().reset_index(drop=True)
# Display the unique locations
print(unique_locations)
pick_lat pick_lng 0 17.442705 78.387878 1 17.490189 78.415512 2 17.370108 78.515045 3 17.439314 78.443001 4 17.432325 78.381966 ... ... ... 44536 17.443661 78.391968 44537 17.439289 78.396118 44538 17.363689 78.535194 44539 17.401539 78.570076 44540 17.385243 78.479896 [44541 rows x 2 columns]
In [43]:
import pandas as pd
# Assuming df is your DataFrame and it has 'drop_lat' and 'drop_lng' columns
# Extract unique drop-off locations
unique_drop_locations = df[['drop_lat', 'drop_lng']].drop_duplicates().reset_index(drop=True)
# Display the unique drop-off locations
print(unique_drop_locations)
drop_lat drop_lng 0 17.457829 78.399056 1 17.450548 78.367294 2 17.377041 78.517921 3 17.397131 78.516586 4 17.401625 78.400032 ... ... ... 44522 17.451042 78.371658 44523 17.449976 78.389160 44524 17.374418 78.529823 44525 17.416904 78.591362 44526 17.394102 78.499550 [44527 rows x 2 columns]
In [44]:
import pandas as pd
# Group by pickup locations and count the number of bookings per location
pickup_counts = df.groupby(['pick_lat', 'pick_lng'])['trip_id'].count().reset_index(name='number_of_bookings')
# Find the location with the maximum bookings
max_pickup_location = pickup_counts.loc[pickup_counts['number_of_bookings'].idxmax()]
print(f"Pickup location with the most bookings: {max_pickup_location}")
Pickup location with the most bookings: pick_lat 17.497129 pick_lng 78.386581 number_of_bookings 4.000000 Name: 43913, dtype: float64
In [45]:
# Display rows 43912 to 43920
print(df.iloc[43911:43920])
trip_id customer_id timestamp pick_lat pick_lng drop_lat \
43911 ID43912 CUST_2582 1.546527e+12 17.463076 78.369011 17.445284
43912 ID43913 CUST_12709 1.546527e+12 17.440476 78.479790 17.441366
43913 ID43914 CUST_10049 1.546527e+12 17.406050 78.390785 17.444227
43914 ID43915 CUST_18971 1.546527e+12 17.388184 78.478332 17.436727
43915 ID43916 CUST_3256 1.546527e+12 17.433126 78.446663 17.402319
43916 ID43917 CUST_279 1.546527e+12 17.404165 78.447098 17.427950
43917 ID43918 CUST_12861 1.546527e+12 17.386606 78.480926 17.403498
43918 ID43919 CUST_066 1.546527e+12 17.455437 78.372810 17.436823
43919 ID43920 CUST_1689 1.546527e+12 17.511930 78.364357 17.510769
drop_lng travel_distance travel_time trip_fare \
43911 78.385788 2.648 9.577733 36.0
43912 78.469864 5.445 0.100767 52.0
43913 78.390289 5.275 16.720600 52.0
43914 78.496376 4.042 9.092983 38.0
43915 78.483666 3.281 10.903133 38.0
43916 78.451126 1.191 3.845300 25.0
43917 78.481491 17.848 50.143417 198.0
43918 78.435692 1.894 5.148433 28.0
43919 78.401916 0.202 1.113883 20.0
date_time time date hour
43911 2019-01-03 14:43:55.934999943 14:43:55.934999 2019-01-03 14
43912 2019-01-03 14:44:05.562999964 14:44:05.562999 2019-01-03 14
43913 2019-01-03 14:44:10.555000067 14:44:10.555000 2019-01-03 14
43914 2019-01-03 14:44:30.289000034 14:44:30.289000 2019-01-03 14
43915 2019-01-03 14:44:41.865000010 14:44:41.865000 2019-01-03 14
43916 2019-01-03 14:44:44.759000063 14:44:44.759000 2019-01-03 14
43917 2019-01-03 14:44:52.944000006 14:44:52.944000 2019-01-03 14
43918 2019-01-03 14:44:53.720000029 14:44:53.720000 2019-01-03 14
43919 2019-01-03 14:44:55.555999994 14:44:55.555999 2019-01-03 14
In [46]:
import pandas as pd
# Group by pickup locations and count the number of bookings per location
pickup_counts = df.groupby(['pick_lat', 'pick_lng'])['trip_id'].count().reset_index(name='number_of_bookings')
# Filter locations with more than 2 bookings
pickup_counts = pickup_counts[pickup_counts['number_of_bookings'] >= 2]
# Sort by number of bookings and select the top 10 locations
top_10_pickup_locations = pickup_counts.nlargest(10, 'number_of_bookings')
# Display the results
print(top_10_pickup_locations)
pick_lat pick_lng number_of_bookings 43913 17.497129 78.386581 4 43906 17.497105 78.386574 3 965 17.366550 78.551582 2 4706 17.391233 78.489227 2 5827 17.394249 78.517014 2 8360 17.399475 78.509415 2 9486 17.401743 78.560585 2 10726 17.404352 78.464462 2 15176 17.414572 78.507652 2 15784 17.415865 78.407753 2
In [47]:
import pandas as pd
# Group by pickup locations and count the number of bookings per location
pickup_counts = df.groupby(['pick_lat', 'pick_lng'])['trip_id'].count().reset_index(name='number_of_bookings')
# Filter locations with more than 1 booking
locations_with_multiple_bookings = pickup_counts[pickup_counts['number_of_bookings'] > 1]
# Display the results
print(locations_with_multiple_bookings)
pick_lat pick_lng number_of_bookings 965 17.366550 78.551582 2 4706 17.391233 78.489227 2 5827 17.394249 78.517014 2 8360 17.399475 78.509415 2 9486 17.401743 78.560585 2 10726 17.404352 78.464462 2 15176 17.414572 78.507652 2 15784 17.415865 78.407753 2 17255 17.420282 78.347046 2 19031 17.425179 78.378700 2 19106 17.425318 78.378311 2 21470 17.430283 78.511963 2 22111 17.431818 78.427940 2 22970 17.433174 78.488373 2 23674 17.434418 78.500732 2 23719 17.434484 78.500893 2 23993 17.434870 78.501007 2 25291 17.436710 78.456528 2 25694 17.437117 78.384895 2 25741 17.437151 78.384941 2 27512 17.439583 78.394836 2 28302 17.440767 78.386002 2 30122 17.442688 78.369636 2 30179 17.442709 78.369942 2 30427 17.442869 78.369850 2 31907 17.444504 78.461517 2 32386 17.445173 78.464653 2 33339 17.446653 78.365685 2 34849 17.448181 78.496559 2 34875 17.448202 78.496521 2 34903 17.448215 78.496574 2 34911 17.448219 78.496551 2 34976 17.448282 78.496635 2 36236 17.450579 78.394325 2 37098 17.452213 78.369850 2 37650 17.453846 78.365990 2 37675 17.453878 78.365959 2 38026 17.454683 78.371643 2 38736 17.456329 78.377350 2 39158 17.457748 78.372345 2 42264 17.480785 78.555748 2 43906 17.497105 78.386574 3 43913 17.497129 78.386581 4
In [48]:
import pandas as pd
# Combine pickup and drop-off locations into a single DataFrame
combined_locations = pd.concat([
df[['pick_lat', 'pick_lng']].rename(columns={'pick_lat': 'latitude', 'pick_lng': 'longitude'}),
df[['drop_lat', 'drop_lng']].rename(columns={'drop_lat': 'latitude', 'drop_lng': 'longitude'})
])
# Remove duplicates to keep only unique locations
unique_locations = combined_locations.drop_duplicates()
# Count the number of bookings for each unique location
location_counts = combined_locations.groupby(['latitude', 'longitude']).size().reset_index(name='number_of_bookings')
# Filter locations with more than 2 bookings
locations_with_multiple_bookings = location_counts[location_counts['number_of_bookings'] >= 2]
# Display the results
print(locations_with_multiple_bookings)
latitude longitude number_of_bookings 175 17.341450 78.529335 2 215 17.343428 78.537392 2 222 17.343584 78.536438 2 232 17.343851 78.534561 2 562 17.350534 78.511032 2 ... ... ... ... 87365 17.498640 78.390190 2 87421 17.498835 78.385956 2 87554 17.499443 78.383965 2 87990 17.506767 78.411156 2 88299 17.516806 78.338860 2 [647 rows x 3 columns]
In [50]:
import pandas as pd
# Group by 'customer_id' and count the number of bookings per customer
customer_bookings = df.groupby('customer_id')['trip_id'].count().reset_index(name='number_of_bookings')
# Find the customer with the maximum bookings
most_frequent_customer = customer_bookings.loc[customer_bookings['number_of_bookings'].idxmax()]
print(f"Customer with the most bookings: {most_frequent_customer}")
Customer with the most bookings: customer_id CUST_279 number_of_bookings 53 Name: 11208, dtype: object
In [51]:
import pandas as pd
# Group by 'customer_id' and count the number of bookings per customer
customer_bookings = df.groupby('customer_id')['trip_id'].count().reset_index(name='number_of_bookings')
# Sort by number of bookings and select the top 15 customers
top_15_customers = customer_bookings.nlargest(10, 'number_of_bookings')
# Display the results
print(top_15_customers)
customer_id number_of_bookings 11208 CUST_279 53 11550 CUST_3100 25 12670 CUST_4119 25 1432 CUST_1120 21 2730 CUST_1237 21 10691 CUST_232 21 11438 CUST_2999 20 16521 CUST_762 20 10320 CUST_1982 19 10453 CUST_2103 19
In [68]:
import pandas as pd
# Group by 'customer_id' and count the number of bookings per customer
customer_bookings = df.groupby('customer_id')['trip_id'].count().reset_index(name='number_of_bookings')
# Calculate the average number of bookings per customer
average_bookings_per_customer = customer_bookings['number_of_bookings'].mean()
print(f'Average number of bookings per customer: {average_bookings_per_customer}')
Average number of bookings per customer: 2.3296410470766498
In [52]:
import pandas as pd
# Identify the trip with the highest travel time
max_travel_time_trip = df.loc[df['travel_time'].idxmax()]
# Extract the relevant details
highest_travel_time = max_travel_time_trip['travel_time']
respective_trip_fare = max_travel_time_trip['trip_fare']
print(f'Highest travel time: {highest_travel_time}')
print(f'Respective trip fare: {respective_trip_fare}')
Highest travel time: 4134.3887 Respective trip fare: 60.0
In [53]:
df
Out[53]:
| trip_id | customer_id | timestamp | pick_lat | pick_lng | drop_lat | drop_lng | travel_distance | travel_time | trip_fare | date_time | time | date | hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ID001 | CUST_001 | 1.546709e+12 | 17.442705 | 78.387878 | 17.457829 | 78.399056 | 2.806 | 12.609667 | 37.0 | 2019-01-05 17:27:50.210999966 | 17:27:50.210999 | 2019-01-05 | 17 |
| 1 | ID002 | CUST_002 | 1.546709e+12 | 17.490189 | 78.415512 | 17.450548 | 78.367294 | 11.991 | 24.075200 | 119.0 | 2019-01-05 17:28:29.523999929 | 17:28:29.523999 | 2019-01-05 | 17 |
| 2 | ID003 | CUST_003 | 1.546709e+12 | 17.370108 | 78.515045 | 17.377041 | 78.517921 | 1.322 | 8.708300 | 27.0 | 2019-01-05 17:28:51.857000113 | 17:28:51.857000 | 2019-01-05 | 17 |
| 3 | ID004 | CUST_004 | 1.546709e+12 | 17.439314 | 78.443001 | 17.397131 | 78.516586 | 11.822 | 24.037550 | 121.0 | 2019-01-05 17:29:18.403000116 | 17:29:18.403000 | 2019-01-05 | 17 |
| 4 | ID005 | CUST_005 | 1.546709e+12 | 17.432325 | 78.381966 | 17.401625 | 78.400032 | 6.978 | 16.120867 | 58.0 | 2019-01-05 17:29:46.884000063 | 17:29:46.884000 | 2019-01-05 | 17 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 44582 | ID44583 | CUST_19137 | 1.546531e+12 | 17.443661 | 78.391968 | 17.451042 | 78.371658 | 5.665 | 12.573883 | 55.0 | 2019-01-03 16:04:51.043999910 | 16:04:51.043999 | 2019-01-03 | 16 |
| 44583 | ID44584 | CUST_19138 | 1.546532e+12 | 17.439289 | 78.396118 | 17.449976 | 78.389160 | 9.292 | 17.937717 | 76.0 | 2019-01-03 16:05:05.868999958 | 16:05:05.868999 | 2019-01-03 | 16 |
| 44584 | ID44585 | CUST_5061 | 1.546532e+12 | 17.363689 | 78.535194 | 17.374418 | 78.529823 | 5.485 | 17.403850 | 55.0 | 2019-01-03 16:05:22.102999926 | 16:05:22.102999 | 2019-01-03 | 16 |
| 44585 | ID44586 | CUST_19139 | 1.546532e+12 | 17.401539 | 78.570076 | 17.416904 | 78.591362 | 2.326 | 14.236767 | 35.0 | 2019-01-03 16:05:36.428999901 | 16:05:36.428999 | 2019-01-03 | 16 |
| 44586 | ID44587 | CUST_15562 | 1.546532e+12 | 17.385243 | 78.479896 | 17.394102 | 78.499550 | 3.204 | 7.085400 | 37.0 | 2019-01-03 16:05:49.937999964 | 16:05:49.937999 | 2019-01-03 | 16 |
44587 rows × 14 columns
In [54]:
import pandas as pd
# Identify the trip with the highest trip fare
max_trip_fare_trip = df.loc[df['trip_fare'].idxmax()]
# Extract the relevant details
highest_trip_fare = max_trip_fare_trip['trip_fare']
respective_travel_time = max_trip_fare_trip['travel_time']
print(f'Highest trip fare: {highest_trip_fare}')
print(f'Respective travel time: {respective_travel_time}')
Highest trip fare: 1670.0 Respective travel time: 267.825483
In [55]:
import pandas as pd
# Check anomalies in the highest travel time trip
highest_travel_time_trip = df.loc[df['travel_time'].idxmax()]
print(f'Anomalies in the highest travel time trip: \n{highest_travel_time_trip}')
# Check anomalies in the highest trip fare trip
highest_trip_fare_trip = df.loc[df['trip_fare'].idxmax()]
print(f'Anomalies in the highest trip fare trip: \n{highest_trip_fare_trip}')
# Identify outliers using IQR
Q1 = df[['travel_time', 'trip_fare']].quantile(0.25)
Q3 = df[['travel_time', 'trip_fare']].quantile(0.75)
IQR = Q3 - Q1
outliers = df[((df[['travel_time', 'trip_fare']] < (Q1 - 1.5 * IQR)) |(df[['travel_time', 'trip_fare']] > (Q3 + 1.5 * IQR))).any(axis=1)]
print(f'Identified outliers: \n{outliers}')
Anomalies in the highest travel time trip:
trip_id ID10317
customer_id CUST_7419
timestamp 1546583372751.0
pick_lat 17.370081
pick_lng 78.479561
drop_lat 17.393438
drop_lng 78.492004
travel_distance 6.889
travel_time 4134.3887
trip_fare 60.0
date_time 2019-01-04 06:29:32.750999928
time 06:29:32.750999
date 2019-01-04
hour 6
Name: 10316, dtype: object
Anomalies in the highest trip fare trip:
trip_id ID42531
customer_id CUST_18770
timestamp 1546519406927.0
pick_lat 17.425972
pick_lng 78.340454
drop_lat 17.441185
drop_lng 78.391289
travel_distance 9.4
travel_time 267.825483
trip_fare 1670.0
date_time 2019-01-03 12:43:26.927000046
time 12:43:26.927000
date 2019-01-03
hour 12
Name: 42530, dtype: object
Identified outliers:
trip_id customer_id timestamp pick_lat pick_lng drop_lat \
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131
9 ID010 CUST_010 1.546709e+12 17.401064 78.418938 17.434597
30 ID031 CUST_031 1.546710e+12 17.447433 78.379745 17.431671
45 ID046 CUST_046 1.546710e+12 17.406568 78.496925 17.519095
... ... ... ... ... ... ...
44519 ID44520 CUST_1955 1.546531e+12 17.439194 78.448189 17.440910
44520 ID44521 CUST_1446 1.546531e+12 17.400417 78.418739 17.441963
44534 ID44535 CUST_7889 1.546531e+12 17.466257 78.367310 17.500685
44539 ID44540 CUST_19130 1.546531e+12 17.407791 78.445877 17.402725
44580 ID44581 CUST_5830 1.546531e+12 17.402431 78.392143 17.421440
drop_lng travel_distance travel_time trip_fare \
1 78.367294 11.991 24.075200 119.0
3 78.516586 11.822 24.037550 121.0
9 78.501793 13.596 43.543183 142.0
30 78.425896 6.553 52.538500 60.0
45 78.396591 21.267 45.602867 243.0
... ... ... ... ...
44519 78.433937 11.567 28.903767 118.0
44520 78.428490 12.127 66.452333 121.0
44534 78.356903 18.036 43.141267 201.0
44539 78.376396 17.200 41.441917 190.0
44580 78.380241 17.243 39.655350 195.0
date_time time date hour
1 2019-01-05 17:28:29.523999929 17:28:29.523999 2019-01-05 17
3 2019-01-05 17:29:18.403000116 17:29:18.403000 2019-01-05 17
9 2019-01-05 17:30:32.591000080 17:30:32.591000 2019-01-05 17
30 2019-01-05 17:37:15.687999964 17:37:15.687999 2019-01-05 17
45 2019-01-05 17:40:23.943000078 17:40:23.943000 2019-01-05 17
... ... ... ... ...
44519 2019-01-03 15:56:08.453999996 15:56:08.453999 2019-01-03 15
44520 2019-01-03 15:56:16.016000032 15:56:16.016000 2019-01-03 15
44534 2019-01-03 15:57:50.066999911 15:57:50.066999 2019-01-03 15
44539 2019-01-03 15:58:36.032000065 15:58:36.032000 2019-01-03 15
44580 2019-01-03 16:04:24.164000034 16:04:24.164000 2019-01-03 16
[3879 rows x 14 columns]
In [76]:
import pandas as pd
# Calculate Q1, Q3 and IQR
Q1 = df[['travel_time', 'trip_fare']].quantile(0.25)
Q3 = df[['travel_time', 'trip_fare']].quantile(0.75)
IQR = Q3 - Q1
# Adjust the boundaries for a more lenient outlier detection
lower_bound = Q1 - 3 * IQR
upper_bound = Q3 + 3 * IQR
# Identify outliers with the adjusted boundaries
outliers = df[((df[['travel_time', 'trip_fare']] < lower_bound) | (df[['travel_time', 'trip_fare']] > upper_bound)).any(axis=1)]
# Display the outliers
print(f'Identified outliers: \n{outliers}')
Identified outliers:
trip_id customer_id timestamp pick_lat pick_lng drop_lat \
9 ID010 CUST_010 1.546709e+12 17.401064 78.418938 17.434597
45 ID046 CUST_046 1.546710e+12 17.406568 78.496925 17.519095
69 ID070 CUST_070 1.546711e+12 17.440002 78.395165 17.388193
89 ID090 CUST_089 1.546711e+12 17.412432 78.465210 17.490995
93 ID094 CUST_093 1.546711e+12 17.399290 78.385506 17.444975
... ... ... ... ... ... ...
44514 ID44515 CUST_2245 1.546531e+12 17.488409 78.389084 17.502531
44520 ID44521 CUST_1446 1.546531e+12 17.400417 78.418739 17.441963
44534 ID44535 CUST_7889 1.546531e+12 17.466257 78.367310 17.500685
44539 ID44540 CUST_19130 1.546531e+12 17.407791 78.445877 17.402725
44580 ID44581 CUST_5830 1.546531e+12 17.402431 78.392143 17.421440
drop_lng travel_distance travel_time trip_fare \
9 78.501793 13.596 43.543183 142.0
45 78.396591 21.267 45.602867 243.0
69 78.494682 15.933 33.612183 176.0
89 78.503151 13.595 29.157767 141.0
93 78.393044 9.283 83.198267 87.0
... ... ... ... ...
44514 78.388596 18.626 60.479533 199.0
44520 78.428490 12.127 66.452333 121.0
44534 78.356903 18.036 43.141267 201.0
44539 78.376396 17.200 41.441917 190.0
44580 78.380241 17.243 39.655350 195.0
date_time time date hour
9 2019-01-05 17:30:32.591000080 17:30:32.591000 2019-01-05 17
45 2019-01-05 17:40:23.943000078 17:40:23.943000 2019-01-05 17
69 2019-01-05 17:49:53.155999899 17:49:53.155999 2019-01-05 17
89 2019-01-05 17:57:09.674999952 17:57:09.674999 2019-01-05 17
93 2019-01-05 17:57:54.219000101 17:57:54.219000 2019-01-05 17
... ... ... ... ...
44514 2019-01-03 15:55:33.858999968 15:55:33.858999 2019-01-03 15
44520 2019-01-03 15:56:16.016000032 15:56:16.016000 2019-01-03 15
44534 2019-01-03 15:57:50.066999911 15:57:50.066999 2019-01-03 15
44539 2019-01-03 15:58:36.032000065 15:58:36.032000 2019-01-03 15
44580 2019-01-03 16:04:24.164000034 16:04:24.164000 2019-01-03 16
[1482 rows x 14 columns]
In [57]:
import pandas as pd
import matplotlib.pyplot as plt
# Assuming df is your DataFrame
# Extract only the relevant columns
data = df[['travel_time', 'trip_fare']]
# Create a box plot to visualize outliers
plt.figure(figsize=(12, 6))
# Travel Time Box Plot
plt.subplot(1, 2, 1)
plt.boxplot(data['travel_time'], vert=False)
plt.title('Box Plot of Travel Time')
plt.xlabel('Travel Time')
# Trip Fare Box Plot
plt.subplot(1, 2, 2)
plt.boxplot(data['trip_fare'], vert=False)
plt.title('Box Plot of Trip Fare')
plt.xlabel('Trip Fare')
plt.tight_layout()
plt.show()
In [58]:
import pandas as pd
# Remove rows with travel time > 4000
df_cleaned = df[df['travel_time'] <= 4000]
# Remove rows with trip fare > 1500
df_cleaned = df_cleaned[df_cleaned['trip_fare'] <= 1500]
# Display the first few rows of the cleaned DataFrame to verify
print(df_cleaned.head())
trip_id customer_id timestamp pick_lat pick_lng drop_lat \
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625
drop_lng travel_distance travel_time trip_fare \
0 78.399056 2.806 12.609667 37.0
1 78.367294 11.991 24.075200 119.0
2 78.517921 1.322 8.708300 27.0
3 78.516586 11.822 24.037550 121.0
4 78.400032 6.978 16.120867 58.0
date_time time date hour
0 2019-01-05 17:27:50.210999966 17:27:50.210999 2019-01-05 17
1 2019-01-05 17:28:29.523999929 17:28:29.523999 2019-01-05 17
2 2019-01-05 17:28:51.857000113 17:28:51.857000 2019-01-05 17
3 2019-01-05 17:29:18.403000116 17:29:18.403000 2019-01-05 17
4 2019-01-05 17:29:46.884000063 17:29:46.884000 2019-01-05 17
In [59]:
import pandas as pd
import matplotlib.pyplot as plt
# Assuming df_cleaned is your cleaned DataFrame
# Extract only the relevant columns
data_cleaned = df_cleaned[['travel_time', 'trip_fare']]
# Create a box plot to visualize the cleaned data
plt.figure(figsize=(12, 6))
# Travel Time Box Plot
plt.subplot(1, 2, 1)
plt.boxplot(data_cleaned['travel_time'], vert=False)
plt.title('Box Plot of Travel Time (Cleaned)')
plt.xlabel('Travel Time')
# Trip Fare Box Plot
plt.subplot(1, 2, 2)
plt.boxplot(data_cleaned['trip_fare'], vert=False)
plt.title('Box Plot of Trip Fare (Cleaned)')
plt.xlabel('Trip Fare')
plt.tight_layout()
plt.show()
In [60]:
import pandas as pd
# Find the row where the trip fare is the highest
highest_trip_fare_row = df.loc[df['trip_fare'].idxmax()]
# Display the row
print(highest_trip_fare_row)
trip_id ID42531 customer_id CUST_18770 timestamp 1546519406927.0 pick_lat 17.425972 pick_lng 78.340454 drop_lat 17.441185 drop_lng 78.391289 travel_distance 9.4 travel_time 267.825483 trip_fare 1670.0 date_time 2019-01-03 12:43:26.927000046 time 12:43:26.927000 date 2019-01-03 hour 12 Name: 42530, dtype: object
In [61]:
df
Out[61]:
| trip_id | customer_id | timestamp | pick_lat | pick_lng | drop_lat | drop_lng | travel_distance | travel_time | trip_fare | date_time | time | date | hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ID001 | CUST_001 | 1.546709e+12 | 17.442705 | 78.387878 | 17.457829 | 78.399056 | 2.806 | 12.609667 | 37.0 | 2019-01-05 17:27:50.210999966 | 17:27:50.210999 | 2019-01-05 | 17 |
| 1 | ID002 | CUST_002 | 1.546709e+12 | 17.490189 | 78.415512 | 17.450548 | 78.367294 | 11.991 | 24.075200 | 119.0 | 2019-01-05 17:28:29.523999929 | 17:28:29.523999 | 2019-01-05 | 17 |
| 2 | ID003 | CUST_003 | 1.546709e+12 | 17.370108 | 78.515045 | 17.377041 | 78.517921 | 1.322 | 8.708300 | 27.0 | 2019-01-05 17:28:51.857000113 | 17:28:51.857000 | 2019-01-05 | 17 |
| 3 | ID004 | CUST_004 | 1.546709e+12 | 17.439314 | 78.443001 | 17.397131 | 78.516586 | 11.822 | 24.037550 | 121.0 | 2019-01-05 17:29:18.403000116 | 17:29:18.403000 | 2019-01-05 | 17 |
| 4 | ID005 | CUST_005 | 1.546709e+12 | 17.432325 | 78.381966 | 17.401625 | 78.400032 | 6.978 | 16.120867 | 58.0 | 2019-01-05 17:29:46.884000063 | 17:29:46.884000 | 2019-01-05 | 17 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 44582 | ID44583 | CUST_19137 | 1.546531e+12 | 17.443661 | 78.391968 | 17.451042 | 78.371658 | 5.665 | 12.573883 | 55.0 | 2019-01-03 16:04:51.043999910 | 16:04:51.043999 | 2019-01-03 | 16 |
| 44583 | ID44584 | CUST_19138 | 1.546532e+12 | 17.439289 | 78.396118 | 17.449976 | 78.389160 | 9.292 | 17.937717 | 76.0 | 2019-01-03 16:05:05.868999958 | 16:05:05.868999 | 2019-01-03 | 16 |
| 44584 | ID44585 | CUST_5061 | 1.546532e+12 | 17.363689 | 78.535194 | 17.374418 | 78.529823 | 5.485 | 17.403850 | 55.0 | 2019-01-03 16:05:22.102999926 | 16:05:22.102999 | 2019-01-03 | 16 |
| 44585 | ID44586 | CUST_19139 | 1.546532e+12 | 17.401539 | 78.570076 | 17.416904 | 78.591362 | 2.326 | 14.236767 | 35.0 | 2019-01-03 16:05:36.428999901 | 16:05:36.428999 | 2019-01-03 | 16 |
| 44586 | ID44587 | CUST_15562 | 1.546532e+12 | 17.385243 | 78.479896 | 17.394102 | 78.499550 | 3.204 | 7.085400 | 37.0 | 2019-01-03 16:05:49.937999964 | 16:05:49.937999 | 2019-01-03 | 16 |
44587 rows × 14 columns
In [62]:
import pandas as pd
# Find all rows where the trip fare is greater than 1000
high_fare_rows = df[df['trip_fare'] > 1000]
# Display the rows
print(high_fare_rows)
trip_id customer_id timestamp pick_lat pick_lng drop_lat \
42530 ID42531 CUST_18770 1.546519e+12 17.425972 78.340454 17.441185
drop_lng travel_distance travel_time trip_fare \
42530 78.391289 9.4 267.825483 1670.0
date_time time date hour
42530 2019-01-03 12:43:26.927000046 12:43:26.927000 2019-01-03 12
In [63]:
import pandas as pd
# Remove rows with trip fare greater than 1000
df_cleaned = df[df['trip_fare'] <= 1000]
# Display the first few rows of the cleaned DataFrame to verify
print(df_cleaned.head())
trip_id customer_id timestamp pick_lat pick_lng drop_lat \
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625
drop_lng travel_distance travel_time trip_fare \
0 78.399056 2.806 12.609667 37.0
1 78.367294 11.991 24.075200 119.0
2 78.517921 1.322 8.708300 27.0
3 78.516586 11.822 24.037550 121.0
4 78.400032 6.978 16.120867 58.0
date_time time date hour
0 2019-01-05 17:27:50.210999966 17:27:50.210999 2019-01-05 17
1 2019-01-05 17:28:29.523999929 17:28:29.523999 2019-01-05 17
2 2019-01-05 17:28:51.857000113 17:28:51.857000 2019-01-05 17
3 2019-01-05 17:29:18.403000116 17:29:18.403000 2019-01-05 17
4 2019-01-05 17:29:46.884000063 17:29:46.884000 2019-01-05 17
In [64]:
import pandas as pd
# Remove rows with travel time greater than 4000
df_cleaned = df[df['travel_time'] <= 4000]
# Display the first few rows of the cleaned DataFrame to verify
print(df_cleaned.head())
trip_id customer_id timestamp pick_lat pick_lng drop_lat \
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625
drop_lng travel_distance travel_time trip_fare \
0 78.399056 2.806 12.609667 37.0
1 78.367294 11.991 24.075200 119.0
2 78.517921 1.322 8.708300 27.0
3 78.516586 11.822 24.037550 121.0
4 78.400032 6.978 16.120867 58.0
date_time time date hour
0 2019-01-05 17:27:50.210999966 17:27:50.210999 2019-01-05 17
1 2019-01-05 17:28:29.523999929 17:28:29.523999 2019-01-05 17
2 2019-01-05 17:28:51.857000113 17:28:51.857000 2019-01-05 17
3 2019-01-05 17:29:18.403000116 17:29:18.403000 2019-01-05 17
4 2019-01-05 17:29:46.884000063 17:29:46.884000 2019-01-05 17
In [65]:
import pandas as pd
# Remove rows with trip fare greater than 1500
df = df[df['trip_fare'] <= 1500]
# Display the first few rows of the cleaned DataFrame to verify
print(df.head())
trip_id customer_id timestamp pick_lat pick_lng drop_lat \
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625
drop_lng travel_distance travel_time trip_fare \
0 78.399056 2.806 12.609667 37.0
1 78.367294 11.991 24.075200 119.0
2 78.517921 1.322 8.708300 27.0
3 78.516586 11.822 24.037550 121.0
4 78.400032 6.978 16.120867 58.0
date_time time date hour
0 2019-01-05 17:27:50.210999966 17:27:50.210999 2019-01-05 17
1 2019-01-05 17:28:29.523999929 17:28:29.523999 2019-01-05 17
2 2019-01-05 17:28:51.857000113 17:28:51.857000 2019-01-05 17
3 2019-01-05 17:29:18.403000116 17:29:18.403000 2019-01-05 17
4 2019-01-05 17:29:46.884000063 17:29:46.884000 2019-01-05 17
In [66]:
import pandas as pd
import matplotlib.pyplot as plt
# Assuming df_cleaned is your cleaned DataFrame
# Extract only the relevant columns
data_cleaned = df_cleaned[['travel_time', 'trip_fare']]
# Create a box plot to visualize the cleaned data
plt.figure(figsize=(12, 6))
# Travel Time Box Plot
plt.subplot(1, 2, 1)
plt.boxplot(data_cleaned['travel_time'], vert=False)
plt.title('Box Plot of Travel Time (Cleaned)')
plt.xlabel('Travel Time')
# Trip Fare Box Plot
plt.subplot(1, 2, 2)
plt.boxplot(data_cleaned['trip_fare'], vert=False)
plt.title('Box Plot of Trip Fare (Cleaned)')
plt.xlabel('Trip Fare')
plt.tight_layout()
plt.show()
In [67]:
import pandas as pd
# Assuming df is your DataFrame
# Remove rows with trip fare greater than 1500
df = df[df['trip_fare'] <= 1500]
# Display the first few rows of the cleaned DataFrame to verify
print(df.head())
trip_id customer_id timestamp pick_lat pick_lng drop_lat \
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625
drop_lng travel_distance travel_time trip_fare \
0 78.399056 2.806 12.609667 37.0
1 78.367294 11.991 24.075200 119.0
2 78.517921 1.322 8.708300 27.0
3 78.516586 11.822 24.037550 121.0
4 78.400032 6.978 16.120867 58.0
date_time time date hour
0 2019-01-05 17:27:50.210999966 17:27:50.210999 2019-01-05 17
1 2019-01-05 17:28:29.523999929 17:28:29.523999 2019-01-05 17
2 2019-01-05 17:28:51.857000113 17:28:51.857000 2019-01-05 17
3 2019-01-05 17:29:18.403000116 17:29:18.403000 2019-01-05 17
4 2019-01-05 17:29:46.884000063 17:29:46.884000 2019-01-05 17
In [68]:
import pandas as pd
import matplotlib.pyplot as plt
# Assuming df is your cleaned DataFrame
# Extract only the relevant columns
data_cleaned = df[['travel_time', 'trip_fare']]
# Create a box plot to visualize the cleaned data
plt.figure(figsize=(12, 6))
# Travel Time Box Plot
plt.subplot(1, 2, 1)
plt.boxplot(data_cleaned['travel_time'], vert=False)
plt.title('Box Plot of Travel Time (Cleaned)')
plt.xlabel('Travel Time')
# Trip Fare Box Plot
plt.subplot(1, 2, 2)
plt.boxplot(data_cleaned['trip_fare'], vert=False)
plt.title('Box Plot of Trip Fare (Cleaned)')
plt.xlabel('Trip Fare')
plt.tight_layout()
plt.show()
In [70]:
import pandas as pd
# Find all rows where the travel time is greater than 4000
long_travel_time_rows = df[df['travel_time'] > 4000]
# Display the rows
print(long_travel_time_rows)
trip_id customer_id timestamp pick_lat pick_lng drop_lat \
10316 ID10317 CUST_7419 1.546583e+12 17.370081 78.479561 17.393438
drop_lng travel_distance travel_time trip_fare \
10316 78.492004 6.889 4134.3887 60.0
date_time time date hour
10316 2019-01-04 06:29:32.750999928 06:29:32.750999 2019-01-04 6
In [71]:
import pandas as pd
# Assuming df is your DataFrame
# Remove rows with travel time greater than 4000
df = df[df['travel_time'] <= 4000]
# Display the first few rows of the cleaned DataFrame to verify
print(df.head())
trip_id customer_id timestamp pick_lat pick_lng drop_lat \
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625
drop_lng travel_distance travel_time trip_fare \
0 78.399056 2.806 12.609667 37.0
1 78.367294 11.991 24.075200 119.0
2 78.517921 1.322 8.708300 27.0
3 78.516586 11.822 24.037550 121.0
4 78.400032 6.978 16.120867 58.0
date_time time date hour
0 2019-01-05 17:27:50.210999966 17:27:50.210999 2019-01-05 17
1 2019-01-05 17:28:29.523999929 17:28:29.523999 2019-01-05 17
2 2019-01-05 17:28:51.857000113 17:28:51.857000 2019-01-05 17
3 2019-01-05 17:29:18.403000116 17:29:18.403000 2019-01-05 17
4 2019-01-05 17:29:46.884000063 17:29:46.884000 2019-01-05 17
In [72]:
import pandas as pd
import matplotlib.pyplot as plt
# Assuming df is your cleaned DataFrame
# Extract only the relevant columns
data_cleaned = df[['travel_time', 'trip_fare']]
# Create a box plot to visualize the cleaned data
plt.figure(figsize=(12, 6))
# Travel Time Box Plot
plt.subplot(1, 2, 1)
plt.boxplot(data_cleaned['travel_time'], vert=False)
plt.title('Box Plot of Travel Time (Cleaned)')
plt.xlabel('Travel Time')
# Trip Fare Box Plot
plt.subplot(1, 2, 2)
plt.boxplot(data_cleaned['trip_fare'], vert=False)
plt.title('Box Plot of Trip Fare (Cleaned)')
plt.xlabel('Trip Fare')
plt.tight_layout()
plt.show()
In [73]:
import pandas as pd
# Assuming df is your cleaned DataFrame
# Sort the DataFrame by travel_distance in descending order and select the top 7 rows
top_7_travel_distances = df.sort_values(by='travel_distance', ascending=False).head(7)
# Display the results
print(top_7_travel_distances)
trip_id customer_id timestamp pick_lat pick_lng drop_lat \
7165 ID7166 CUST_1006 1.546790e+12 17.443495 78.390411 17.444721
18259 ID18260 CUST_11167 1.546417e+12 17.392820 78.459129 17.405748
213 ID214 CUST_208 1.546532e+12 17.366833 78.516487 17.341431
43368 ID43369 CUST_18921 1.546524e+12 17.445814 78.377289 17.448383
3305 ID3306 CUST_2780 1.546741e+12 17.406488 78.495430 17.459713
39402 ID39403 CUST_17958 1.546494e+12 17.448320 78.496551 17.448164
15481 ID15482 CUST_10018 1.546398e+12 17.455072 78.380791 17.434895
drop_lng travel_distance travel_time trip_fare \
7165 78.386597 52.801 14.164500 593.0
18259 78.443039 46.747 98.218850 521.0
213 78.528763 40.543 83.046817 445.0
43368 78.388031 32.523 31.401633 348.0
3305 78.501694 32.407 57.244633 376.0
39402 78.465332 32.123 106.770667 372.0
15481 78.368591 30.980 74.384900 363.0
date_time time date hour
7165 2019-01-06 15:53:45.615999937 15:53:45.615999 2019-01-06 15
18259 2019-01-02 08:08:35.635999918 08:08:35.635999 2019-01-02 8
213 2019-01-03 16:13:48.407999992 16:13:48.407999 2019-01-03 16
43368 2019-01-03 13:52:11.894999981 13:52:11.894999 2019-01-03 13
3305 2019-01-06 02:21:08.697000027 02:21:08.697000 2019-01-06 2
39402 2019-01-03 05:33:04.163000107 05:33:04.163000 2019-01-03 5
15481 2019-01-02 02:58:15.796000004 02:58:15.796000 2019-01-02 2
In [74]:
import pandas as pd
import matplotlib.pyplot as plt
# Assuming df is your cleaned DataFrame
# Extract only the relevant columns
data_cleaned = df[['travel_distance', 'travel_time', 'trip_fare']]
# Create a box plot to visualize the cleaned data
plt.figure(figsize=(18, 6))
# Travel Distance Box Plot
plt.subplot(1, 3, 1)
plt.boxplot(data_cleaned['travel_distance'], vert=False)
plt.title('Box Plot of Travel Distance (Cleaned)')
plt.xlabel('Travel Distance')
# Travel Time Box Plot
plt.subplot(1, 3, 2)
plt.boxplot(data_cleaned['travel_time'], vert=False)
plt.title('Box Plot of Travel Time (Cleaned)')
plt.xlabel('Travel Time')
# Trip Fare Box Plot
plt.subplot(1, 3, 3)
plt.boxplot(data_cleaned['trip_fare'], vert=False)
plt.title('Box Plot of Trip Fare (Cleaned)')
plt.xlabel('Trip Fare')
plt.tight_layout()
plt.show()
In [75]:
df
Out[75]:
| trip_id | customer_id | timestamp | pick_lat | pick_lng | drop_lat | drop_lng | travel_distance | travel_time | trip_fare | date_time | time | date | hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ID001 | CUST_001 | 1.546709e+12 | 17.442705 | 78.387878 | 17.457829 | 78.399056 | 2.806 | 12.609667 | 37.0 | 2019-01-05 17:27:50.210999966 | 17:27:50.210999 | 2019-01-05 | 17 |
| 1 | ID002 | CUST_002 | 1.546709e+12 | 17.490189 | 78.415512 | 17.450548 | 78.367294 | 11.991 | 24.075200 | 119.0 | 2019-01-05 17:28:29.523999929 | 17:28:29.523999 | 2019-01-05 | 17 |
| 2 | ID003 | CUST_003 | 1.546709e+12 | 17.370108 | 78.515045 | 17.377041 | 78.517921 | 1.322 | 8.708300 | 27.0 | 2019-01-05 17:28:51.857000113 | 17:28:51.857000 | 2019-01-05 | 17 |
| 3 | ID004 | CUST_004 | 1.546709e+12 | 17.439314 | 78.443001 | 17.397131 | 78.516586 | 11.822 | 24.037550 | 121.0 | 2019-01-05 17:29:18.403000116 | 17:29:18.403000 | 2019-01-05 | 17 |
| 4 | ID005 | CUST_005 | 1.546709e+12 | 17.432325 | 78.381966 | 17.401625 | 78.400032 | 6.978 | 16.120867 | 58.0 | 2019-01-05 17:29:46.884000063 | 17:29:46.884000 | 2019-01-05 | 17 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 44582 | ID44583 | CUST_19137 | 1.546531e+12 | 17.443661 | 78.391968 | 17.451042 | 78.371658 | 5.665 | 12.573883 | 55.0 | 2019-01-03 16:04:51.043999910 | 16:04:51.043999 | 2019-01-03 | 16 |
| 44583 | ID44584 | CUST_19138 | 1.546532e+12 | 17.439289 | 78.396118 | 17.449976 | 78.389160 | 9.292 | 17.937717 | 76.0 | 2019-01-03 16:05:05.868999958 | 16:05:05.868999 | 2019-01-03 | 16 |
| 44584 | ID44585 | CUST_5061 | 1.546532e+12 | 17.363689 | 78.535194 | 17.374418 | 78.529823 | 5.485 | 17.403850 | 55.0 | 2019-01-03 16:05:22.102999926 | 16:05:22.102999 | 2019-01-03 | 16 |
| 44585 | ID44586 | CUST_19139 | 1.546532e+12 | 17.401539 | 78.570076 | 17.416904 | 78.591362 | 2.326 | 14.236767 | 35.0 | 2019-01-03 16:05:36.428999901 | 16:05:36.428999 | 2019-01-03 | 16 |
| 44586 | ID44587 | CUST_15562 | 1.546532e+12 | 17.385243 | 78.479896 | 17.394102 | 78.499550 | 3.204 | 7.085400 | 37.0 | 2019-01-03 16:05:49.937999964 | 16:05:49.937999 | 2019-01-03 | 16 |
44585 rows × 14 columns
In [76]:
import pandas as pd
# Remove rows with trip fare greater than 1500
df = df[df['trip_fare'] <= 1500]
# Remove rows with travel time greater than 4000
df = df[df['travel_time'] <= 4000]
# Display the first few rows of the cleaned DataFrame to verify
print(df.head())
trip_id customer_id timestamp pick_lat pick_lng drop_lat \
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625
drop_lng travel_distance travel_time trip_fare \
0 78.399056 2.806 12.609667 37.0
1 78.367294 11.991 24.075200 119.0
2 78.517921 1.322 8.708300 27.0
3 78.516586 11.822 24.037550 121.0
4 78.400032 6.978 16.120867 58.0
date_time time date hour
0 2019-01-05 17:27:50.210999966 17:27:50.210999 2019-01-05 17
1 2019-01-05 17:28:29.523999929 17:28:29.523999 2019-01-05 17
2 2019-01-05 17:28:51.857000113 17:28:51.857000 2019-01-05 17
3 2019-01-05 17:29:18.403000116 17:29:18.403000 2019-01-05 17
4 2019-01-05 17:29:46.884000063 17:29:46.884000 2019-01-05 17
In [77]:
df.describe()
Out[77]:
| timestamp | pick_lat | pick_lng | drop_lat | drop_lng | travel_distance | travel_time | trip_fare | date_time | hour | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 4.458500e+04 | 44585.000000 | 44585.000000 | 44585.000000 | 44585.000000 | 44585.000000 | 44585.000000 | 44585.000000 | 44585 | 44585.000000 |
| mean | 1.546632e+12 | 17.427921 | 78.435544 | 17.427891 | 78.434897 | 5.094223 | 15.721807 | 53.836447 | 2019-01-04 20:00:45.579670784 | 9.482180 |
| min | 1.546368e+12 | 17.330339 | 78.308258 | 12.921696 | 77.548103 | -1.000000 | 0.022750 | 20.000000 | 2019-01-01 18:32:07.134999990 | 0.000000 |
| 25% | 1.546503e+12 | 17.405327 | 78.386566 | 17.405661 | 78.385010 | 2.744000 | 8.428450 | 36.000000 | 2019-01-03 08:12:45.868000 | 5.000000 |
| 50% | 1.546611e+12 | 17.432136 | 78.438866 | 17.431213 | 78.438164 | 4.299000 | 13.125017 | 46.000000 | 2019-01-04 14:08:08.160000 | 10.000000 |
| 75% | 1.546772e+12 | 17.446777 | 78.480843 | 17.446907 | 78.480255 | 6.679000 | 20.110433 | 60.000000 | 2019-01-06 10:49:36.798000128 | 13.000000 |
| max | 1.546886e+12 | 17.529791 | 78.600647 | 17.736155 | 78.634804 | 52.801000 | 962.766250 | 959.000000 | 2019-01-07 18:29:25.111000061 | 23.000000 |
| std | 1.518707e+08 | 0.030049 | 0.053331 | 0.037722 | 0.054965 | 3.365011 | 12.985863 | 32.543213 | NaN | 4.708676 |
In [78]:
import pandas as pd
# Calculate the correlation matrix
correlation_matrix = df[['trip_fare', 'travel_distance', 'travel_time']].corr()
# Display the correlation matrix
print(correlation_matrix)
trip_fare travel_distance travel_time trip_fare 1.000000 0.912258 0.618430 travel_distance 0.912258 1.000000 0.666326 travel_time 0.618430 0.666326 1.000000
In [79]:
!pip install scikit-learn
Requirement already satisfied: scikit-learn in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (1.5.2) Requirement already satisfied: numpy>=1.19.5 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from scikit-learn) (2.1.2) Requirement already satisfied: scipy>=1.6.0 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from scikit-learn) (1.14.1) Requirement already satisfied: joblib>=1.2.0 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from scikit-learn) (1.4.2) Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from scikit-learn) (3.5.0)
In [80]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Prepare the data
X = df[['travel_distance']]
y = df['trip_fare']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize and train the model
model_distance_only = LinearRegression()
model_distance_only.fit(X_train, y_train)
# Make predictions
y_pred = model_distance_only.predict(X_test)
# Evaluate the model
mse_distance_only = mean_squared_error(y_test, y_pred)
print(f'Model using travel distance only - Mean Squared Error: {mse_distance_only}')
Model using travel distance only - Mean Squared Error: 56.29018661135825
In [81]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Prepare the data
X = df[['travel_distance', 'travel_time']]
y = df['trip_fare']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize and train the model
model_distance_time = LinearRegression()
model_distance_time.fit(X_train, y_train)
# Make predictions
y_pred = model_distance_time.predict(X_test)
# Evaluate the model
mse_distance_time = mean_squared_error(y_test, y_pred)
print(f'Model using travel distance and travel time - Mean Squared Error: {mse_distance_time}')
Model using travel distance and travel time - Mean Squared Error: 56.04789395759822
In [82]:
# Convert date_time column to datetime if not already done
df['date_time'] = pd.to_datetime(df['date_time'])
# Extract the hour and minute from 'date_time' column
df['hour'] = df['date_time'].dt.hour
df['minute'] = df['date_time'].dt.minute
In [83]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Prepare the data
X = df[['travel_distance', 'travel_time', 'hour', 'minute']]
y = df['trip_fare']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize and train the model
model_enhanced = LinearRegression()
model_enhanced.fit(X_train, y_train)
# Make predictions
y_pred = model_enhanced.predict(X_test)
# Evaluate the model
mse_enhanced = mean_squared_error(y_test, y_pred)
print(f'Model using travel distance, travel time, hour, and minute - Mean Squared Error: {mse_enhanced}')
Model using travel distance, travel time, hour, and minute - Mean Squared Error: 56.080137323933485
In [84]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# Prepare the data
X = df[['travel_distance', 'travel_time']]
y = df['trip_fare']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(random_state=42)
param_grid = {'n_estimators': [100, 200], 'max_depth': [10, 20]}
grid_search = GridSearchCV(rf_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)
# Make predictions
y_pred_rf = grid_search.predict(X_test)
# Evaluate the model
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f'Random Forest Model - Mean Squared Error: {mse_rf}')
import joblib
# Assuming rf_model is your trained Random Forest model
joblib.dump(rf_model, 'rf_model.pkl')
Random Forest Model - Mean Squared Error: 32.76109310535315
Out[84]:
['rf_model.pkl']
In [85]:
import pandas as pd
import matplotlib.pyplot as plt
# Group by 'customer_id' and count the number of bookings per customer
customer_bookings = df.groupby('customer_id')['trip_id'].count().reset_index(name='number_of_bookings')
# Sort by number of bookings and select the top 5 customers
top_5_customers = customer_bookings.nlargest(5, 'number_of_bookings')
# Create a bar graph
plt.figure(figsize=(10, 6))
plt.bar(top_5_customers['customer_id'], top_5_customers['number_of_bookings'], color='skyblue')
plt.xlabel('Customer ID', fontsize=12)
plt.ylabel('Bookings', fontsize=12)
plt.title('Our Most Loyal Customers', fontsize=15)
plt.xticks(rotation=0)
plt.show()
In [86]:
!pip install xgboost
Requirement already satisfied: xgboost in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (2.1.1) Requirement already satisfied: numpy in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from xgboost) (2.1.2) Requirement already satisfied: scipy in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from xgboost) (1.14.1)
In [87]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
# Prepare the data
X = df[['travel_distance', 'travel_time', 'hour', 'minute']]
y = df['trip_fare']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize and train the XGBoost model
xgb_model = XGBRegressor()
xgb_model.fit(X_train, y_train)
# Make predictions
y_pred_xgb = xgb_model.predict(X_test)
# Evaluate the model
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f'XGBoost Model - Mean Squared Error: {mse_xgb}')
XGBoost Model - Mean Squared Error: 104.66511439298114
In [88]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib
In [89]:
# Assuming df is your DataFrame
# Ensure 'date_time' is in datetime format
df['date_time'] = pd.to_datetime(df['date_time'])
# Extract hour and minute
df['hour'] = df['date_time'].dt.hour
df['minute'] = df['date_time'].dt.minute
In [90]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib
# Prepare the data
X = df[['travel_distance', 'travel_time']]
y = df['trip_fare']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
# Save the trained model
joblib.dump(rf_model, 'rf_model.pkl')
# Evaluate the model
y_pred = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred)
print(f'Random Forest Model - Mean Squared Error: {mse_rf}')
Random Forest Model - Mean Squared Error: 31.47284183021195
In [91]:
import joblib
import pandas as pd
# Load the trained model
rf_model = joblib.load('rf_model.pkl')
# Function to predict trip fare
def predict_trip_fare(travel_distance, travel_time):
features = pd.DataFrame([[travel_distance, travel_time]], columns=['travel_distance', 'travel_time'])
predicted_fare = rf_model.predict(features)
return predicted_fare[0]
# Example usage
trip_distance = 10.0 # Replace with actual distance
travel_time = 20.0 # Replace with actual time
predicted_fare = predict_trip_fare(trip_distance, travel_time)
print(f'Predicted Trip Fare: {predicted_fare}')
Predicted Trip Fare: 75.63
In [92]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib
# Prepare the data
X = df[['travel_distance']]
y = df['trip_fare']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize and train the Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
# Save the trained model
joblib.dump(lr_model, 'lr_model.pkl')
# Evaluate the model
y_pred = lr_model.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred)
print(f'Linear Regression Model - Mean Squared Error: {mse_lr}')
Linear Regression Model - Mean Squared Error: 56.29018661135825
In [93]:
import joblib
import pandas as pd
# Load the trained model
lr_model = joblib.load('lr_model.pkl')
# Function to predict trip fare
def predict_trip_fare(travel_distance):
features = pd.DataFrame([[travel_distance]], columns=['travel_distance'])
predicted_fare = lr_model.predict(features)
return predicted_fare[0]
# Example usage
trip_distance = 2.806 # Replace with actual distance
predicted_fare = predict_trip_fare(trip_distance)
print(f'Predicted Trip Fare: {predicted_fare}')
Predicted Trip Fare: 33.73111737695252
In [94]:
df
Out[94]:
| trip_id | customer_id | timestamp | pick_lat | pick_lng | drop_lat | drop_lng | travel_distance | travel_time | trip_fare | date_time | time | date | hour | minute | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ID001 | CUST_001 | 1.546709e+12 | 17.442705 | 78.387878 | 17.457829 | 78.399056 | 2.806 | 12.609667 | 37.0 | 2019-01-05 17:27:50.210999966 | 17:27:50.210999 | 2019-01-05 | 17 | 27 |
| 1 | ID002 | CUST_002 | 1.546709e+12 | 17.490189 | 78.415512 | 17.450548 | 78.367294 | 11.991 | 24.075200 | 119.0 | 2019-01-05 17:28:29.523999929 | 17:28:29.523999 | 2019-01-05 | 17 | 28 |
| 2 | ID003 | CUST_003 | 1.546709e+12 | 17.370108 | 78.515045 | 17.377041 | 78.517921 | 1.322 | 8.708300 | 27.0 | 2019-01-05 17:28:51.857000113 | 17:28:51.857000 | 2019-01-05 | 17 | 28 |
| 3 | ID004 | CUST_004 | 1.546709e+12 | 17.439314 | 78.443001 | 17.397131 | 78.516586 | 11.822 | 24.037550 | 121.0 | 2019-01-05 17:29:18.403000116 | 17:29:18.403000 | 2019-01-05 | 17 | 29 |
| 4 | ID005 | CUST_005 | 1.546709e+12 | 17.432325 | 78.381966 | 17.401625 | 78.400032 | 6.978 | 16.120867 | 58.0 | 2019-01-05 17:29:46.884000063 | 17:29:46.884000 | 2019-01-05 | 17 | 29 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 44582 | ID44583 | CUST_19137 | 1.546531e+12 | 17.443661 | 78.391968 | 17.451042 | 78.371658 | 5.665 | 12.573883 | 55.0 | 2019-01-03 16:04:51.043999910 | 16:04:51.043999 | 2019-01-03 | 16 | 4 |
| 44583 | ID44584 | CUST_19138 | 1.546532e+12 | 17.439289 | 78.396118 | 17.449976 | 78.389160 | 9.292 | 17.937717 | 76.0 | 2019-01-03 16:05:05.868999958 | 16:05:05.868999 | 2019-01-03 | 16 | 5 |
| 44584 | ID44585 | CUST_5061 | 1.546532e+12 | 17.363689 | 78.535194 | 17.374418 | 78.529823 | 5.485 | 17.403850 | 55.0 | 2019-01-03 16:05:22.102999926 | 16:05:22.102999 | 2019-01-03 | 16 | 5 |
| 44585 | ID44586 | CUST_19139 | 1.546532e+12 | 17.401539 | 78.570076 | 17.416904 | 78.591362 | 2.326 | 14.236767 | 35.0 | 2019-01-03 16:05:36.428999901 | 16:05:36.428999 | 2019-01-03 | 16 | 5 |
| 44586 | ID44587 | CUST_15562 | 1.546532e+12 | 17.385243 | 78.479896 | 17.394102 | 78.499550 | 3.204 | 7.085400 | 37.0 | 2019-01-03 16:05:49.937999964 | 16:05:49.937999 | 2019-01-03 | 16 | 5 |
44585 rows × 15 columns
In [97]:
df.to_csv('rapido_analysis.csv', index=False)
In [ ]: